[flang-commits] [clang] [clang-tools-extra] [compiler-rt] [flang] [libc] [llvm] [Clang] Fix confusing diagnostic with explicit 'this' parameters. (PR #99824)

Braden Helmer via flang-commits flang-commits at lists.llvm.org
Tue Jul 23 04:53:55 PDT 2024


https://github.com/bradenhelmer updated https://github.com/llvm/llvm-project/pull/99824

From e4c3701ea6be894b1094f45d0590c61e5aa44897 Mon Sep 17 00:00:00 2001
From: Braden Helmer <bradenhelmeraus@gmail.com>
Date: Sun, 21 Jul 2024 14:10:17 -0400
Subject: [PATCH 1/4] Fix diag mismatch

---
 clang/lib/Sema/SemaOverload.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 5ea6b06121c7c..102cb5f7baef2 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -11152,7 +11152,8 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
  // non-constructor method.  Note that 'I' corresponds to the
   // conversion-slot index.
   bool isObjectArgument = false;
-  if (isa<CXXMethodDecl>(Fn) && !isa<CXXConstructorDecl>(Fn)) {
+  if (isa<CXXMethodDecl>(Fn) && !isa<CXXConstructorDecl>(Fn) &&
+      !Fn->hasCXXExplicitFunctionObjectParameter()) {
     if (I == 0)
       isObjectArgument = true;
     else
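
For reference, here is a minimal C++23 reproducer (a sketch of mine, not
part of the patch) for the confusing diagnostic this hunk addresses. With
an explicit object parameter, 'Fn' is still a CXXMethodDecl, so the old
code shifted the conversion-slot index 'I' down by one for every non-object
slot, and the resulting bad-conversion note pointed one argument too early.
Compile with -std=c++23:

  struct Widget {
    void set(this Widget &self, int &&value) {}
  };

  void demo() {
    Widget w;
    int i = 0;
    // Not viable: 'i' is an lvalue and cannot bind to 'int &&value'.
    // The note should blame the 2nd argument ('value'); with the
    // unconditional 'I--' above, it was reported one slot too early.
    w.set(i);
  }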

From 7f15d3d1a58f11c0ec2b6e646131c0fc0b7a9d5b Mon Sep 17 00:00:00 2001
From: Braden Helmer <bradenhelmeraus@gmail.com>
Date: Sun, 21 Jul 2024 15:37:09 -0400
Subject: [PATCH 2/4] Keep object-argument classification and add test

---
 clang/lib/Sema/SemaOverload.cpp            | 5 ++---
 clang/test/SemaCXX/cxx2b-deducing-this.cpp | 8 ++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 1ef43b2103a9b..ac90ccd5643c1 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -11146,11 +11146,10 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
  // non-constructor method.  Note that 'I' corresponds to the
   // conversion-slot index.
   bool isObjectArgument = false;
-  if (isa<CXXMethodDecl>(Fn) && !isa<CXXConstructorDecl>(Fn) &&
-      !Fn->hasCXXExplicitFunctionObjectParameter()) {
+  if (isa<CXXMethodDecl>(Fn) && !isa<CXXConstructorDecl>(Fn)) {
     if (I == 0)
       isObjectArgument = true;
-    else
+    else if (!Fn->hasCXXExplicitFunctionObjectParameter())
       I--;
   }
 
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
index 5cbc1f735383b..6ad54b9b2c63d 100644
--- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp
+++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
@@ -959,3 +959,11 @@ void f();
 };
 void a::f(this auto) {} // expected-error {{an explicit object parameter cannot appear in a non-member function}}
 }
+
+struct R {
+  void f(this auto &&self, int &&r_value_ref) {} // expected-note {{candidate function template not viable: expects an rvalue for 2nd argument}}
+  void g(int &&r_value_ref) {
+    f(r_value_ref); // expected-error {{no matching member function for call to 'f'}}
+  }
+};
+
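
To make the final logic concrete, here is a hedged sketch (mine, not from
the diff) of the two cases the branch now distinguishes. With an implicit
object parameter the conversion-slot index runs one ahead of the declared
parameter list, so 'I' must be decremented; with an explicit object
parameter the slots and the declared parameters already line up, so only
the I == 0 object-argument classification is shared:

  struct S {
    // Implicit 'this': conversion slots are {object, r}. A mismatch at
    // slot 1 concerns the 1st declared parameter, so 'I' is decremented.
    void g(int &&r);

    // Explicit 'this': conversion slots are {s, r}. A mismatch at slot 1
    // already names the 2nd declared parameter, so 'I' stays as-is.
    void h(this S s, int &&r);
  };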

From 19c05ce1b9b30a75ca72de1393d15b4e07e4c5e1 Mon Sep 17 00:00:00 2001
From: Braden Helmer <bradenhelmeraus@gmail.com>
Date: Tue, 23 Jul 2024 07:09:50 -0400
Subject: [PATCH 3/4] Add release note

---
 clang/docs/ReleaseNotes.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4638b91b48f95..481b6313974ea 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -746,6 +746,8 @@ Improvements to Clang's diagnostics
 - Clang now diagnoses dangling assignments for pointer-like objects (annotated with `[[gsl::Pointer]]`) under `-Wdangling-assignment-gsl` (off by default)
   Fixes #GH63310.
 
+- Clang now emits the correct argument index in bad-conversion diagnostics for member functions with an explicit object ('this') parameter. Fixes #GH97878.
+
 Improvements to Clang's time-trace
 ----------------------------------
 

From 1d1ba2ac408cbfdad9611ca6520968440a13f217 Mon Sep 17 00:00:00 2001
From: Braden Helmer <bradenhelmeraus@gmail.com>
Date: Tue, 23 Jul 2024 07:49:40 -0400
Subject: [PATCH 4/4] Revert "Merge branch 'main' into diag-mismatch-97878"

This reverts commit e0ae9eeaf2c192719e8d4e1e273ebfb0478dff6b, reversing
changes made to 19c05ce1b9b30a75ca72de1393d15b4e07e4c5e1.
---
 .github/CODEOWNERS                            |    1 -
 .github/workflows/version-check.yml           |    2 +-
 bolt/docs/CommandLineArgumentReference.md     |    2 +-
 bolt/lib/Core/BinaryContext.cpp               |    2 +-
 bolt/lib/Rewrite/DWARFRewriter.cpp            |  144 +-
 clang-tools-extra/clang-tidy/add_new_check.py |    4 +-
 .../clang-tidy/cert/CERTTidyModule.cpp        |  354 +-
 .../clang-tidy/misc/ConstCorrectnessCheck.cpp |   17 +-
 .../clang-tidy/misc/ConstCorrectnessCheck.h   |    4 +-
 .../UnnecessaryValueParamCheck.cpp            |   69 +-
 .../performance/UnnecessaryValueParamCheck.h  |   12 +-
 .../clang-tidy/tool/run-clang-tidy.py         |  323 +-
 .../clang-tidy/utils/ASTUtils.cpp             |    2 +-
 .../clang-tidy/utils/ExceptionAnalyzer.cpp    |   10 +-
 clang-tools-extra/clangd/IncludeCleaner.cpp   |   20 -
 .../clangd/unittests/IncludeCleanerTests.cpp  |   22 -
 clang-tools-extra/docs/ReleaseNotes.rst       |  421 +
 .../checks/clang-analyzer/cplusplus.Move.rst  |    8 +-
 .../optin.taint.TaintedAlloc.rst              |   14 -
 .../security.PutenvStackArray.rst             |   10 -
 .../unix.BlockInCriticalSection.rst           |   13 -
 .../docs/clang-tidy/checks/list.rst           |   10 +-
 .../include-cleaner/lib/Analysis.cpp          |   17 +-
 .../unittests/AnalysisTest.cpp                |   26 -
 .../checkers/bugprone/exception-escape.cpp    |   18 -
 .../misc/const-correctness-values.cpp         |   11 +-
 clang/cmake/caches/Fuchsia-stage2.cmake       |    2 -
 clang/docs/ClangNVLinkWrapper.rst             |   74 -
 clang/docs/HLSL/AvailabilityDiagnostics.rst   |    5 +-
 clang/docs/HLSL/ExpectedDifferences.rst       |    2 +-
 clang/docs/ReleaseNotes.rst                   | 1130 ++-
 clang/docs/index.rst                          |    1 -
 clang/include/clang/AST/ASTContext.h          |    8 +-
 clang/include/clang/AST/ExprCXX.h             |   10 +-
 clang/include/clang/Basic/Attr.td             |    2 +-
 clang/include/clang/Basic/AttrDocs.td         |    3 +-
 .../include/clang/Basic/DiagnosticLexKinds.td |    5 +
 .../clang/Basic/DiagnosticParseKinds.td       |    3 -
 .../clang/Basic/DiagnosticSemaKinds.td        |   10 +-
 clang/include/clang/Basic/Features.def        |    2 -
 clang/include/clang/Basic/IdentifierTable.h   |   24 +-
 clang/include/clang/Basic/LangOptions.def     |    2 -
 .../include/clang/Basic/PointerAuthOptions.h  |   11 -
 clang/include/clang/Basic/TokenKinds.def      |    3 +
 clang/include/clang/Basic/arm_neon.td         |   64 +-
 clang/include/clang/Basic/arm_neon_incl.td    |    2 +-
 clang/include/clang/Driver/Options.td         |   15 -
 .../include/clang/Frontend/FrontendOptions.h  |    8 +-
 clang/include/clang/Lex/Preprocessor.h        |   83 +-
 clang/include/clang/Lex/PreprocessorOptions.h |    5 -
 clang/include/clang/Lex/Token.h               |    3 +
 clang/include/clang/Parse/Parser.h            |    3 +-
 clang/lib/APINotes/APINotesFormat.h           |    2 +-
 clang/lib/APINotes/APINotesYAMLCompiler.cpp   |   15 +-
 clang/lib/AST/ASTConcept.cpp                  |    6 +-
 clang/lib/AST/ASTContext.cpp                  |   12 +-
 clang/lib/AST/ASTImporter.cpp                 |    9 +-
 clang/lib/AST/DeclCXX.cpp                     |    3 -
 clang/lib/AST/ExprCXX.cpp                     |   19 -
 clang/lib/AST/ExprConstant.cpp                |   36 +-
 clang/lib/AST/Interp/ByteCodeEmitter.cpp      |    3 +-
 clang/lib/AST/Interp/Compiler.cpp             |    4 +-
 clang/lib/AST/Interp/Descriptor.cpp           |   10 +-
 clang/lib/AST/Interp/Descriptor.h             |    2 -
 clang/lib/AST/Interp/Interp.h                 |   18 +-
 clang/lib/AST/Interp/InterpBuiltin.cpp        |   32 +-
 clang/lib/AST/Interp/Opcodes.td               |    2 -
 clang/lib/AST/Interp/Pointer.cpp              |   26 +-
 clang/lib/AST/Interp/Pointer.h                |    3 -
 .../FlowSensitive/DataflowEnvironment.cpp     |   21 +-
 clang/lib/Basic/IdentifierTable.cpp           |    3 +-
 clang/lib/Basic/Targets/AArch64.cpp           |    9 +-
 clang/lib/Basic/Targets/LoongArch.cpp         |   23 +-
 clang/lib/Basic/Targets/LoongArch.h           |    2 -
 clang/lib/Basic/Targets/X86.h                 |    9 +
 clang/lib/CodeGen/CGCall.cpp                  |  213 +-
 clang/lib/CodeGen/CGExpr.cpp                  |    5 -
 clang/lib/CodeGen/CGExprConstant.cpp          |    3 +-
 clang/lib/CodeGen/CGPointerAuth.cpp           |   34 -
 clang/lib/CodeGen/CGRecordLayout.h            |    4 -
 clang/lib/CodeGen/CodeGenFunction.cpp         |    2 -
 clang/lib/CodeGen/CodeGenFunction.h           |    3 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |    2 -
 clang/lib/CodeGen/CodeGenModule.h             |    8 -
 clang/lib/CodeGen/ItaniumCXXABI.cpp           |  254 +-
 clang/lib/CodeGen/TargetInfo.h                |    1 -
 clang/lib/Driver/ToolChain.cpp                |    5 +-
 clang/lib/Driver/ToolChains/AIX.cpp           |    6 -
 clang/lib/Driver/ToolChains/Arch/AArch64.cpp  |   21 -
 clang/lib/Driver/ToolChains/Arch/AArch64.h    |    3 -
 .../lib/Driver/ToolChains/Arch/LoongArch.cpp  |   86 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |   68 +-
 clang/lib/Driver/ToolChains/Cuda.cpp          |   65 +-
 clang/lib/Driver/ToolChains/Cuda.h            |    3 -
 clang/lib/Driver/ToolChains/Linux.cpp         |    3 -
 clang/lib/Driver/ToolChains/PS4CPU.h          |    2 +-
 clang/lib/Format/TokenAnnotator.cpp           |   10 +-
 clang/lib/Frontend/CompilerInvocation.cpp     |   25 +-
 .../lib/Frontend/PrintPreprocessedOutput.cpp  |   12 +-
 clang/lib/Lex/DependencyDirectivesScanner.cpp |   12 +-
 clang/lib/Lex/PPLexerChange.cpp               |    9 +-
 clang/lib/Lex/Preprocessor.cpp                |  444 +-
 clang/lib/Lex/TokenConcatenation.cpp          |   10 +
 clang/lib/Parse/ParseAST.cpp                  |   10 +-
 clang/lib/Parse/ParseDecl.cpp                 |    8 +-
 clang/lib/Parse/ParseExpr.cpp                 |    2 +-
 clang/lib/Parse/ParsePragma.cpp               |   25 -
 clang/lib/Parse/ParseStmt.cpp                 |    9 -
 clang/lib/Parse/Parser.cpp                    |   93 +-
 clang/lib/Sema/SemaAPINotes.cpp               |  119 +-
 clang/lib/Sema/SemaAvailability.cpp           |    6 -
 clang/lib/Sema/SemaDeclAttr.cpp               |    4 -
 clang/lib/Sema/SemaExpr.cpp                   |   42 +-
 clang/lib/Sema/SemaOpenMP.cpp                 |   12 +-
 clang/lib/Sema/SemaTemplate.cpp               |    4 -
 clang/lib/Sema/SemaTemplateDeductionGuide.cpp |   25 +-
 clang/lib/Sema/SemaTemplateInstantiate.cpp    |   42 +-
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |   11 +-
 clang/lib/Serialization/ASTReaderStmt.cpp     |   36 +-
 .../Checkers/ArrayBoundCheckerV2.cpp          |   18 +-
 .../Checkers/BuiltinFunctionChecker.cpp       |   38 -
 .../StaticAnalyzer/Checkers/StreamChecker.cpp |   72 +-
 clang/lib/StaticAnalyzer/Core/BugReporter.cpp |    4 +-
 .../APINotes/Inputs/Headers/Methods.apinotes  |   24 -
 clang/test/APINotes/Inputs/Headers/Methods.h  |    1 +
 .../Inputs/Headers/Namespaces.apinotes        |    3 -
 .../test/APINotes/Inputs/Headers/Namespaces.h |    3 -
 clang/test/APINotes/methods.cpp               |    8 -
 clang/test/APINotes/namespaces.cpp            |    5 -
 clang/test/AST/Interp/atomic.c                |   13 -
 clang/test/AST/Interp/codegen.cpp             |   13 -
 clang/test/AST/ast-dump-ctad-alias.cpp        |   34 +-
 .../Inputs/system-header-simulator-cxx.h      |   14 +-
 clang/test/Analysis/builtin-functions.cpp     |   20 -
 .../diagnostics/explicit-suppression.cpp      |    2 +-
 clang/test/Analysis/issue-94193.cpp           |   41 -
 .../test/Analysis/out-of-bounds-diagnostics.c |   21 -
 clang/test/Analysis/stream.c                  |   12 -
 clang/test/Analysis/use-after-move.cpp        |    9 +-
 clang/test/CMakeLists.txt                     |    1 -
 clang/test/CXX/cpp/cpp.module/p2.cppm         |   88 +
 clang/test/CXX/drs/cwg6xx.cpp                 |    2 +-
 .../CXX/expr/expr.unary/expr.unary.op/p4.cpp  |   42 +-
 .../basic/basic.link/module-declaration.cpp   |   61 +-
 .../dcl.module/dcl.module.import/p1.cppm      |   39 +-
 .../temp/temp.arg/temp.arg.template/p3-0x.cpp |   10 -
 .../temp.deduct/temp.deduct.call/p3-0x.cpp    |    2 +-
 .../temp/temp.spec/temp.expl.spec/p2-20.cpp   |    8 +-
 clang/test/CodeGen/attr-counted-by.c          |   48 -
 .../CodeGen/ptrauth-function-attributes.c     |    5 -
 .../ptrauth-global-constant-initializers.cpp  |  234 -
 .../ptrauth-member-function-pointer.cpp       |  433 -
 .../CodeGenCXX/ptrauth-type-info-vtable.cpp   |   87 -
 .../usr/include/aarch64-linux-gnu/.keep       |    0
 .../usr/include/aarch64-linux-pauthtest/.keep |    0
 clang/test/Driver/aarch64-multilib-pauthabi.c |    4 -
 clang/test/Driver/aarch64-ptrauth.c           |   85 +-
 clang/test/Driver/cuda-cross-compiling.c      |    8 +-
 clang/test/Driver/fpatchable-function-entry.c |    9 +-
 clang/test/Driver/ftime-trace-sections.cpp    |    2 +-
 clang/test/Driver/ftime-trace.cpp             |   39 +-
 clang/test/Driver/loongarch-features.c        |    2 +-
 clang/test/Driver/loongarch-march.c           |   33 -
 clang/test/Driver/loongarch-mlasx.c           |    6 +-
 clang/test/Driver/loongarch-msimd.c           |    4 +-
 clang/test/Driver/loongarch-msingle-float.c   |    4 +-
 clang/test/Driver/loongarch-msoft-float.c     |    4 +-
 clang/test/Driver/loongarch-mtune.c           |    5 -
 clang/test/Driver/nvlink-wrapper.c            |   65 -
 .../Driver/print-supported-extensions-riscv.c |    2 +-
 clang/test/Driver/ps4-ps5-runtime-flags.c     |    4 +-
 clang/test/Parser/cxx-template-decl.cpp       |    4 -
 clang/test/Preprocessor/embed_weird.cpp       |    8 -
 clang/test/Preprocessor/init-loongarch.c      |   43 +-
 clang/test/Preprocessor/pragma_mc_func.c      |   23 -
 .../test/Preprocessor/riscv-target-features.c |   18 +-
 .../aarch64-neon-without-target-feature.cpp   |   36 -
 clang/test/Sema/atomic-ops.c                  |   25 -
 .../Sema/patchable-function-entry-attr.cpp    |    5 -
 clang/test/Sema/ptrauth-indirect-goto.c       |   18 -
 clang/test/SemaCXX/builtin_vectorelements.cpp |    2 -
 .../cxx1y-variable-templates_in_class.cpp     |   16 +-
 clang/test/SemaCXX/cxx20-ctad-type-alias.cpp  |   30 +-
 .../SemaCXX/invalid-template-declaration.cpp  |    6 -
 clang/test/SemaCXX/modules.cppm               |   89 +-
 .../alias-template-deprecated.cpp             |   83 -
 .../test/SemaTemplate/class-template-decl.cpp |    2 +-
 clang/test/SemaTemplate/deduction-guide.cpp   |   68 +-
 clang/test/SemaTemplate/nested-template.cpp   |    2 +-
 clang/test/lit.cfg.py                         |    1 -
 clang/tools/CMakeLists.txt                    |    1 -
 .../clang-fuzzer/dictionary/CMakeLists.txt    |    1 -
 .../tools/clang-nvlink-wrapper/CMakeLists.txt |   42 -
 .../ClangNVLinkWrapper.cpp                    |  785 --
 .../tools/clang-nvlink-wrapper/NVLinkOpts.td  |   90 -
 clang/tools/driver/cc1_main.cpp               |    3 +-
 clang/unittests/AST/ASTImporterTest.cpp       |   95 +-
 .../FlowSensitive/DataflowEnvironmentTest.cpp |   28 -
 clang/unittests/Format/FormatTest.cpp         |    1 -
 clang/unittests/Format/TokenAnnotatorTest.cpp |   18 -
 .../Lex/DependencyDirectivesScannerTest.cpp   |   11 -
 clang/unittests/Support/TimeProfilerTest.cpp  |  127 +-
 clang/unittests/Tooling/CMakeLists.txt        |    1 -
 .../DeductionGuide.cpp                        |    4 +-
 clang/utils/TableGen/NeonEmitter.cpp          |    6 +-
 clang/www/cxx_status.html                     |    2 +-
 cmake/Modules/LLVMVersion.cmake               |    2 +-
 .../cmake/Modules/AllSupportedArchDefs.cmake  |    4 +-
 compiler-rt/cmake/config-ix.cmake             |    3 +-
 compiler-rt/include/profile/InstrProfData.inc |    1 -
 compiler-rt/lib/builtins/CMakeLists.txt       |    2 +-
 .../builtins/aarch64/sme-libc-mem-routines.S  |  344 -
 .../lib/builtins/aarch64/sme-libc-routines.c  |   75 +
 compiler-rt/lib/builtins/fp_extend.h          |   11 +-
 compiler-rt/lib/builtins/fp_lib.h             |   11 +-
 compiler-rt/lib/rtsan/rtsan.cpp               |   15 +-
 compiler-rt/lib/rtsan/rtsan.h                 |    7 -
 compiler-rt/lib/rtsan/rtsan_interceptors.cpp  |   25 -
 compiler-rt/lib/rtsan/tests/CMakeLists.txt    |    1 -
 .../sanitizer_common_interceptors.inc         |    2 +-
 .../sanitizer_procmaps_bsd.cpp                |    3 +-
 .../sanitizer_stacktrace_printer.cpp          |    4 +-
 .../lib/scudo/standalone/tests/CMakeLists.txt |    7 +-
 .../asan/TestCases/Posix/ignore_free_hook.cpp |   12 +-
 .../TestCases/Posix/ignore_free_hook.cpp      |    8 +-
 .../Linux/instrprof-vtable-value-prof.cpp     |   27 +-
 cross-project-tests/lit.cfg.py                |    9 +-
 flang/lib/Lower/ConvertCall.cpp               |   54 +-
 flang/lib/Lower/DirectivesCommon.h            |   23 -
 .../lib/Lower/OpenMP/DataSharingProcessor.cpp |   49 +-
 flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp  |   23 +-
 .../lib/Optimizer/Transforms/AddDebugInfo.cpp |    8 +-
 flang/runtime/CMakeLists.txt                  |   12 -
 flang/runtime/derived.cpp                     |    7 +-
 flang/test/Fir/declare-codegen.fir            |   14 -
 .../Lower/HLFIR/calls-poly-to-nonpoly.f90     |   26 -
 .../Lower/OpenMP/Todo/atomic-character.f90    |    8 -
 .../test/Lower/OpenMP/Todo/atomic-complex.f90 |    8 -
 flang/test/Lower/OpenMP/atomic-read.f90       |   28 +-
 .../Lower/OpenMP/lastprivate-allocatable.f90  |   38 -
 flang/test/Transforms/debug-92391.fir         |   18 -
 libc/benchmarks/gpu/CMakeLists.txt            |    6 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      |   59 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.h        |   33 +-
 .../modules/LLVMLibCCompileOptionRules.cmake  |    4 -
 libc/cmake/modules/LLVMLibCFlagRules.cmake    |    3 -
 libc/cmake/modules/LLVMLibCObjectRules.cmake  |   25 +
 libc/cmake/modules/LLVMLibCTestRules.cmake    |   25 +-
 libc/config/baremetal/arm/entrypoints.txt     |    1 -
 libc/config/baremetal/riscv/entrypoints.txt   |    1 -
 libc/config/darwin/arm/entrypoints.txt        |    1 -
 libc/config/darwin/x86_64/entrypoints.txt     |    1 -
 libc/config/gpu/entrypoints.txt               |   15 +-
 libc/config/linux/aarch64/entrypoints.txt     |    2 -
 libc/config/linux/arm/entrypoints.txt         |    1 -
 libc/config/linux/riscv/entrypoints.txt       |   15 +-
 libc/config/linux/riscv/headers.txt           |    8 +-
 libc/config/linux/x86_64/entrypoints.txt      |    6 +-
 libc/docs/dev/header_generation.rst           |  249 +-
 libc/docs/math/index.rst                      |    2 +-
 libc/include/llvm-libc-types/fsblkcnt_t.h     |    2 +-
 libc/include/llvm-libc-types/fsfilcnt_t.h     |    2 +-
 libc/newhdrgen/yaml_to_classes.py             |    5 +-
 libc/spec/llvm_libc_ext.td                    |    2 -
 libc/spec/stdc.td                             |    3 -
 libc/src/__support/FPUtil/BasicOperations.h   |   13 +-
 libc/src/__support/FPUtil/CMakeLists.txt      |   57 +-
 libc/src/__support/FPUtil/generic/FMA.h       |   45 +-
 libc/src/__support/FPUtil/generic/add_sub.h   |   26 +-
 libc/src/__support/FPUtil/generic/div.h       |   20 +-
 libc/src/__support/FPUtil/generic/mul.h       |   20 +-
 libc/src/__support/OSUtil/linux/fcntl.cpp     |   42 +-
 libc/src/__support/libc_assert.h              |    2 +-
 libc/src/__support/threads/linux/raw_mutex.h  |    1 -
 libc/src/__support/threads/thread.h           |    1 -
 libc/src/math/CMakeLists.txt                  |    3 -
 libc/src/math/dsqrtf128.h                     |   21 -
 libc/src/math/dsqrtl.h                        |   20 -
 libc/src/math/generic/CMakeLists.txt          |   45 +-
 libc/src/math/generic/copysign.cpp            |    4 -
 libc/src/math/generic/copysignf.cpp           |    4 -
 libc/src/math/generic/copysignf16.cpp         |    4 -
 libc/src/math/generic/dsqrtf128.cpp           |   20 -
 libc/src/math/generic/dsqrtl.cpp              |   20 -
 libc/src/math/generic/expm1.cpp               |    7 +
 libc/src/math/generic/fabs.cpp                |    8 +-
 libc/src/math/generic/fabsf.cpp               |    8 +-
 libc/src/math/generic/fabsf16.cpp             |   13 +-
 libc/src/stdlib/CMakeLists.txt                |   15 +-
 libc/src/stdlib/at_quick_exit.cpp             |    2 -
 libc/src/stdlib/atexit.cpp                    |    3 -
 libc/src/stdlib/exit_handler.cpp              |   43 +
 libc/src/stdlib/exit_handler.h                |   28 +-
 libc/src/sys/epoll/linux/epoll_pwait2.cpp     |   12 +
 libc/test/UnitTest/CMakeLists.txt             |    9 +-
 libc/test/src/__support/blockstore_test.cpp   |    9 +-
 libc/test/src/math/CMakeLists.txt             |   13 -
 libc/test/src/math/dsqrtl_test.cpp            |   13 -
 .../BinaryOpSingleOutputPerf.h                |    7 +-
 .../math/performance_testing/CMakeLists.txt   |   21 +-
 .../SingleInputSingleOutputPerf.h             |   44 +-
 .../misc_basic_ops_perf.cpp                   |   41 -
 libc/test/src/math/smoke/AddTest.h            |   19 +-
 libc/test/src/math/smoke/CMakeLists.txt       |   24 -
 libc/test/src/math/smoke/DivTest.h            |   19 +-
 libc/test/src/math/smoke/MulTest.h            |   19 +-
 libc/test/src/math/smoke/SubTest.h            |   19 +-
 libc/test/src/math/smoke/dsqrtf128_test.cpp   |   13 -
 libc/test/src/math/smoke/dsqrtl_test.cpp      |   13 -
 .../src/sys/statvfs/linux/fstatvfs_test.cpp   |   12 +-
 .../src/sys/statvfs/linux/statvfs_test.cpp    |   10 +-
 libc/utils/MPFRWrapper/MPFRUtils.cpp          |    6 -
 libclc/generic/lib/common/sign.cl             |    9 -
 ...-hardening-mode-fast-with-abi-breaks.cmake |    2 -
 libcxx/docs/Hardening.rst                     |   26 +-
 libcxx/docs/ReleaseNotes.rst                  |    2 +-
 libcxx/docs/ReleaseNotes/19.rst               |   14 -
 libcxx/docs/Status/Cxx23Issues.csv            |    2 +-
 libcxx/docs/Status/Cxx23Papers.csv            |   12 +-
 libcxx/include/__config                       |    2 +-
 libcxx/include/__configuration/abi.h          |   13 -
 libcxx/include/__iterator/bounded_iter.h      |    2 -
 libcxx/include/__ranges/transform_view.h      |    7 +
 libcxx/include/string                         |   35 +-
 libcxx/include/vector                         |   37 +-
 .../sequences/vector/abi.compile.pass.cpp     |    8 -
 .../vector/assert.iterator.index.pass.cpp     |   51 -
 ...d.pass.cpp => debug.iterator.add.pass.cpp} |   15 +-
 ....cpp => debug.iterator.decrement.pass.cpp} |    8 +-
 ...pp => debug.iterator.dereference.pass.cpp} |   11 +-
 ....cpp => debug.iterator.increment.pass.cpp} |   15 +-
 .../vector/debug.iterator.index.pass.cpp      |   42 +
 .../sequences/vector/fill_to_capacity.h       |   24 -
 .../basic.string/alignof.compile.pass.cpp     |    9 -
 .../basic.string/sizeof.compile.pass.cpp      |    9 -
 ...d.pass.cpp => debug.iterator.add.pass.cpp} |    6 +-
 ....cpp => debug.iterator.decrement.pass.cpp} |    6 +-
 ...pp => debug.iterator.dereference.pass.cpp} |    6 +-
 ....cpp => debug.iterator.increment.pass.cpp} |    6 +-
 ...pass.cpp => debug.iterator.index.pass.cpp} |    9 +-
 .../assert.push_back.invalidation.pass.cpp    |   56 -
 .../thousands_sep.pass.cpp                    |    8 +-
 .../range.transform/iterator/deref.pass.cpp   |    2 +-
 .../iterator/iter_move.pass.cpp               |   49 +
 .../resize_and_overwrite.pass.cpp             |    5 +-
 .../ci/vendor/android/Dockerfile.emulator     |    2 +-
 .../ci/vendor/android/emulator-entrypoint.sh  |    2 +-
 libcxx/utils/libcxx/test/features.py          |    2 -
 lld/COFF/Symbols.h                            |   10 +-
 lld/ELF/Arch/RISCV.cpp                        |    3 -
 lld/MachO/Driver.cpp                          |   64 +-
 lld/MachO/Writer.cpp                          |   35 +-
 lld/docs/ReleaseNotes.rst                     |   64 +
 .../aarch64-cortex-a53-843419-abs-mapsyms.s   |    2 +-
 lld/test/ELF/aarch64-gnu-ifunc.s              |    2 +-
 lld/test/ELF/aarch64-reloc-pauth.s            |   12 +-
 lld/test/ELF/aarch64-thunk-script.s           |    4 +-
 lld/test/ELF/basic-aarch64.s                  |    4 +-
 lld/test/ELF/pack-dyn-relocs.s                |   48 +-
 lld/test/MachO/arm64-32-stubs.s               |    2 +-
 lld/test/MachO/arm64-stubs.s                  |    2 +-
 lld/test/MachO/dyld-stub-binder.s             |   14 +-
 .../invalid/chained-fixups-incompatible.s     |    4 +
 lld/test/MachO/objc-selrefs.s                 |    7 +-
 lldb/include/lldb/Target/DynamicLoader.h      |    9 +-
 .../MacOSX-DYLD/DynamicLoaderDarwin.cpp       |    4 +-
 .../MacOSX-DYLD/DynamicLoaderDarwin.h         |    4 +-
 llvm/benchmarks/CMakeLists.txt                |    3 +-
 llvm/benchmarks/xxhash.cpp                    |   29 -
 llvm/docs/DirectX/DXILOpTableGenDesign.rst    |  403 +-
 llvm/docs/PointerAuth.md                      |   24 -
 llvm/docs/RISCVUsage.rst                      |    9 +-
 llvm/docs/ReleaseNotes.rst                    |  311 +
 .../llvm/Analysis/TargetTransformInfo.h       |   10 -
 .../llvm/Analysis/TargetTransformInfoImpl.h   |    2 -
 llvm/include/llvm/Analysis/ValueTracking.h    |   30 +-
 llvm/include/llvm/CodeGen/AsmPrinter.h        |    3 -
 .../CodeGen/GlobalISel/GenericMachineInstrs.h |   19 -
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |    1 -
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |   74 +-
 llvm/include/llvm/CodeGen/Passes.h            |    6 +-
 llvm/include/llvm/CodeGen/RegAllocCommon.h    |    6 +-
 llvm/include/llvm/CodeGen/RegAllocFast.h      |    2 +-
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |    6 -
 .../ExecutionEngine/Orc/Shared/MemoryFlags.h  |    2 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   25 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   26 -
 llvm/include/llvm/MC/MCAsmBackend.h           |    5 +-
 llvm/include/llvm/MC/MCAssembler.h            |  157 +-
 llvm/include/llvm/MC/MCContext.h              |    6 +-
 llvm/include/llvm/MC/MCELFObjectWriter.h      |   77 +-
 llvm/include/llvm/MC/MCELFStreamer.h          |    3 -
 llvm/include/llvm/MC/MCMachObjectWriter.h     |   64 -
 llvm/include/llvm/MC/MCObjectWriter.h         |   39 +-
 llvm/include/llvm/MC/MCSPIRVObjectWriter.h    |   28 -
 llvm/include/llvm/MC/MCSection.h              |    5 +
 llvm/include/llvm/MC/MCWinCOFFObjectWriter.h  |   31 -
 llvm/include/llvm/MC/MCWinCOFFStreamer.h      |    3 -
 llvm/include/llvm/MC/MCXCOFFObjectWriter.h    |    9 -
 llvm/include/llvm/Passes/PassBuilder.h        |   10 +-
 .../llvm/ProfileData/InstrProfData.inc        |    1 -
 llvm/include/llvm/SandboxIR/SandboxIR.h       |  116 +-
 .../llvm/SandboxIR/SandboxIRValues.def        |    2 -
 llvm/include/llvm/Support/TimeProfiler.h      |   23 +-
 llvm/include/llvm/Support/raw_socket_stream.h |   28 +-
 .../TargetParser/LoongArchTargetParser.def    |    2 -
 .../llvm/TargetParser/LoongArchTargetParser.h |    3 -
 .../llvm/TargetParser/RISCVTargetParser.h     |    8 -
 llvm/include/llvm/TargetParser/Triple.h       |    6 +-
 .../include/llvm/Transforms/Instrumentation.h |    6 -
 .../Instrumentation/PGOInstrumentation.h      |    6 +-
 .../Transforms/Utils/SimplifyCFGOptions.h     |    5 -
 llvm/lib/Analysis/Loads.cpp                   |   12 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |    4 -
 llvm/lib/Analysis/ValueTracking.cpp           |   56 +-
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |    6 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp    |    6 +-
 llvm/lib/CodeGen/BranchFolding.cpp            |    9 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |   54 +-
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   33 -
 llvm/lib/CodeGen/RegAllocBase.h               |    9 +-
 llvm/lib/CodeGen/RegAllocBasic.cpp            |   10 +-
 llvm/lib/CodeGen/RegAllocFast.cpp             |   16 +-
 llvm/lib/CodeGen/RegAllocGreedy.cpp           |   10 +-
 llvm/lib/CodeGen/RegAllocGreedy.h             |    2 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   46 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |    8 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   11 +-
 .../CodeGen/StackFrameLayoutAnalysisPass.cpp  |   16 +-
 llvm/lib/CodeGenData/CMakeLists.txt           |    1 -
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  493 +-
 llvm/lib/IR/Metadata.cpp                      |   10 +-
 llvm/lib/MC/ELFObjectWriter.cpp               |  185 +-
 llvm/lib/MC/MCAsmBackend.cpp                  |   11 +-
 llvm/lib/MC/MCAssembler.cpp                   |  105 +-
 llvm/lib/MC/MCContext.cpp                     |   22 +-
 llvm/lib/MC/MCELFStreamer.cpp                 |   32 +-
 llvm/lib/MC/MCMachOStreamer.cpp               |   25 +-
 llvm/lib/MC/MCObjectFileInfo.cpp              |   22 +-
 llvm/lib/MC/MCObjectStreamer.cpp              |   13 +-
 llvm/lib/MC/MCObjectWriter.cpp                |   13 -
 llvm/lib/MC/MCSection.cpp                     |    4 +-
 llvm/lib/MC/MCSectionXCOFF.cpp                |    2 +-
 llvm/lib/MC/MCWinCOFFStreamer.cpp             |   13 +-
 llvm/lib/MC/MCXCOFFStreamer.cpp               |    9 +-
 llvm/lib/MC/MachObjectWriter.cpp              |   28 +-
 llvm/lib/MC/SPIRVObjectWriter.cpp             |   38 +-
 llvm/lib/MC/WinCOFFObjectWriter.cpp           |   62 +-
 llvm/lib/MC/XCOFFObjectWriter.cpp             |   37 +-
 llvm/lib/Object/COFFImportFile.cpp            |    5 +-
 llvm/lib/Passes/PassBuilder.cpp               |    6 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   24 +-
 llvm/lib/SandboxIR/SandboxIR.cpp              |  117 +-
 llvm/lib/Support/TimeProfiler.cpp             |   61 +-
 llvm/lib/Support/raw_socket_stream.cpp        |  132 +-
 llvm/lib/Support/xxhash.cpp                   |  192 +-
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp |  407 +-
 llvm/lib/Target/AArch64/AArch64FastISel.cpp   |    4 -
 llvm/lib/Target/AArch64/AArch64Features.td    |    3 -
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  102 -
 .../Target/AArch64/AArch64ISelLowering.cpp    |  201 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |    1 -
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   79 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |   12 -
 llvm/lib/Target/AArch64/AArch64Subtarget.h    |   10 -
 .../GISel/AArch64InstructionSelector.cpp      |  110 +-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |    2 -
 .../MCTargetDesc/AArch64ELFStreamer.cpp       |    8 +-
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   20 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |    4 -
 .../AMDGPU/AMDGPULateCodeGenPrepare.cpp       |    8 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |    2 -
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |    5 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |    2 -
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   15 +-
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |  112 -
 llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h  |    1 -
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |    3 -
 .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp  |    1 -
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     |    6 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   10 -
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |   48 +-
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  |   14 +-
 llvm/lib/Target/AMDGPU/SMInstructions.td      |   91 +-
 .../ARM/MCTargetDesc/ARMELFStreamer.cpp       |    4 +-
 .../Target/AVR/MCTargetDesc/AVRAsmBackend.cpp |    1 -
 .../AVR/MCTargetDesc/AVRELFStreamer.cpp       |    7 +-
 .../CSKY/MCTargetDesc/CSKYELFStreamer.cpp     |    9 +-
 .../CSKY/MCTargetDesc/CSKYELFStreamer.h       |    3 +
 .../Hexagon/HexagonLoopIdiomRecognition.cpp   |    2 +-
 .../MCTargetDesc/HexagonAsmBackend.cpp        |    4 +-
 .../MCTargetDesc/HexagonMCTargetDesc.cpp      |    3 +-
 llvm/lib/Target/LoongArch/LoongArch.td        |   12 +-
 .../LoongArch/LoongArchISelLowering.cpp       | 1099 +--
 .../Target/LoongArch/LoongArchISelLowering.h  |   10 -
 .../LoongArch/LoongArchLASXInstrInfo.td       |  130 -
 .../Target/LoongArch/LoongArchLSXInstrInfo.td |  148 -
 .../LoongArchTargetTransformInfo.cpp          |    2 +
 .../MCTargetDesc/LoongArchELFStreamer.cpp     |    6 +-
 .../MSP430/MCTargetDesc/MSP430ELFStreamer.cpp |    4 +
 .../Mips/MCTargetDesc/MipsAsmBackend.cpp      |    1 -
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp  |   58 +-
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp  |   12 +-
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     |   27 +-
 .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp    |    6 +-
 .../RISCV/MCTargetDesc/RISCVELFStreamer.cpp   |   10 +-
 .../RISCV/MCTargetDesc/RISCVELFStreamer.h     |    1 +
 llvm/lib/Target/RISCV/RISCVFeatures.td        |  136 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |   23 +-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |    6 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   11 +-
 llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp     |    5 +-
 .../Target/X86/MCTargetDesc/X86AsmBackend.cpp |   26 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   14 -
 llvm/lib/Target/X86/X86TargetMachine.cpp      |    6 +-
 .../lib/Target/X86/X86TargetTransformInfo.cpp |    5 -
 llvm/lib/Target/X86/X86TargetTransformInfo.h  |    2 -
 llvm/lib/TargetParser/Host.cpp                |    5 +-
 .../TargetParser/LoongArchTargetParser.cpp    |   11 -
 llvm/lib/TargetParser/RISCVTargetParser.cpp   |   16 -
 llvm/lib/TargetParser/Triple.cpp              |    3 -
 llvm/lib/Transforms/Coroutines/CoroEarly.cpp  |    1 -
 .../InstCombine/InstCombineSelect.cpp         |    7 +-
 .../Instrumentation/AddressSanitizer.cpp      |    2 -
 .../Instrumentation/InstrProfiling.cpp        |  245 +-
 .../Instrumentation/PGOInstrumentation.cpp    |   23 +-
 .../lib/Transforms/Scalar/SimplifyCFGPass.cpp |    9 +-
 .../Scalar/StraightLineStrengthReduce.cpp     |    6 +-
 llvm/lib/Transforms/Utils/DXILResource.cpp    |    4 +-
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      |    4 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |   16 +-
 .../Vectorize/LoopVectorizationLegality.cpp   |    6 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |    6 +-
 .../Transforms/Vectorize/SLPVectorizer.cpp    |    3 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   10 +-
 .../test/Analysis/CostModel/RISCV/arith-fp.ll |  300 +-
 .../CostModel/RISCV/rvv-intrinsics.ll         |  130 +-
 .../GlobalISel/legalize-threeway-cmp.mir      |  141 -
 .../GlobalISel/legalizer-info-validation.mir  |    9 +-
 .../CodeGen/AArch64/callbr-asm-obj-file.ll    |   16 +-
 llvm/test/CodeGen/AArch64/cmp-chains.ll       |  243 -
 llvm/test/CodeGen/AArch64/cmp-select-sign.ll  |  206 -
 ...rleaving-reductions-predicated-scalable.ll |    3 +-
 llvm/test/CodeGen/AArch64/double_reduct.ll    |    9 +-
 llvm/test/CodeGen/AArch64/fast-isel-select.ll |  293 +-
 .../CodeGen/AArch64/hardened-br-jump-table.ll |  133 -
 .../CodeGen/AArch64/inlineasm-ldr-pseudo.ll   |    2 +-
 .../AArch64/intrinsic-cttz-elts-sve.ll        |    3 +-
 llvm/test/CodeGen/AArch64/neon-abd.ll         |    2 +
 llvm/test/CodeGen/AArch64/ptrauth-fpac.ll     |  374 -
 .../CodeGen/AArch64/ptrauth-indirectbr.ll     |  248 -
 ...trauth-intrinsic-auth-resign-with-blend.ll |  254 -
 .../AArch64/ptrauth-intrinsic-auth-resign.ll  |  638 --
 llvm/test/CodeGen/AArch64/scmp.ll             |   87 +-
 llvm/test/CodeGen/AArch64/sve-doublereduct.ll |   41 +-
 .../CodeGen/AArch64/sve-fixed-vector-zext.ll  |   44 +-
 .../CodeGen/AArch64/sve-fp-int-min-max.ll     |    3 +-
 llvm/test/CodeGen/AArch64/sve-int-reduce.ll   |   88 +
 .../CodeGen/AArch64/sve-stack-frame-layout.ll |  135 -
 ...-streaming-mode-fixed-length-reductions.ll |  102 +
 llvm/test/CodeGen/AArch64/ucmp.ll             |  134 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    |   52 +-
 .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll  |  100 +-
 .../GlobalISel/inst-select-fract.f64.mir      |  108 +-
 .../GlobalISel/inst-select-load-constant.mir  |  253 +-
 .../GlobalISel/llvm.amdgcn.div.scale.ll       |   56 +-
 .../GlobalISel/llvm.amdgcn.intersect_ray.ll   |   24 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll  |    8 +-
 .../GlobalISel/llvm.amdgcn.update.dpp.ll      |   58 +-
 .../AMDGPU/GlobalISel/mul-known-bits.i64.ll   |    8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |  110 +-
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll |  348 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll   |    4 +-
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  433 +-
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   |   30 +-
 .../GlobalISel/widen-i8-i16-scalar-loads.ll   |  168 +-
 llvm/test/CodeGen/AMDGPU/add.ll               |  116 +-
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |   98 +-
 llvm/test/CodeGen/AMDGPU/amd.endpgm.ll        |   12 +-
 .../AMDGPU/amdgcn-load-offset-from-reg.ll     |    4 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      |  849 +-
 llvm/test/CodeGen/AMDGPU/and.ll               |   12 +-
 .../AMDGPU/atomic_optimizations_buffer.ll     |   13 +-
 .../atomic_optimizations_global_pointer.ll    | 1475 ++--
 .../atomic_optimizations_local_pointer.ll     |  253 +-
 .../AMDGPU/atomic_optimizations_raw_buffer.ll |   13 +-
 .../atomic_optimizations_struct_buffer.ll     |   13 +-
 .../branch-folding-implicit-def-subreg.ll     |    2 +-
 llvm/test/CodeGen/AMDGPU/build_vector.ll      |   12 +-
 .../test/CodeGen/AMDGPU/carryout-selection.ll |  156 +-
 llvm/test/CodeGen/AMDGPU/clamp-modifier.ll    |  114 +-
 llvm/test/CodeGen/AMDGPU/clamp.ll             |  260 +-
 llvm/test/CodeGen/AMDGPU/cluster_stores.ll    |   88 +-
 .../CodeGen/AMDGPU/combine-cond-add-sub.ll    |   20 +-
 .../CodeGen/AMDGPU/combine-vload-extract.ll   |   22 +-
 llvm/test/CodeGen/AMDGPU/copy_to_scc.ll       |   22 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |  180 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |  158 +-
 llvm/test/CodeGen/AMDGPU/cttz.ll              |  184 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |  126 +-
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     |  300 +-
 .../CodeGen/AMDGPU/dag-divergence-atomic.ll   |  212 +-
 llvm/test/CodeGen/AMDGPU/div_i128.ll          |   64 +-
 .../AMDGPU/divergence-driven-buildvector.ll   |   64 +-
 .../AMDGPU/divergence-driven-trunc-to-i1.ll   |   41 +-
 llvm/test/CodeGen/AMDGPU/ds_write2.ll         |   46 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |   20 +-
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |  136 +-
 llvm/test/CodeGen/AMDGPU/fdiv.ll              |   86 +-
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      |  140 +-
 .../CodeGen/AMDGPU/flat_atomics_i32_system.ll |   56 +-
 .../CodeGen/AMDGPU/flat_atomics_i64_system.ll |  130 +-
 llvm/test/CodeGen/AMDGPU/fmax3.ll             |  152 +-
 llvm/test/CodeGen/AMDGPU/fmed3.ll             |  564 +-
 llvm/test/CodeGen/AMDGPU/fmin3.ll             |  228 +-
 llvm/test/CodeGen/AMDGPU/fmul.f16.ll          |  180 +-
 llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll       |  250 +-
 .../AMDGPU/fp-min-max-buffer-atomics.ll       |   30 +-
 .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll   |   30 +-
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll     |  154 +-
 .../AMDGPU/fp64-min-max-buffer-atomics.ll     |   24 +-
 .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll |   24 +-
 llvm/test/CodeGen/AMDGPU/fpext.f16.ll         |  766 +-
 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll        |   60 +-
 llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll       |  324 +-
 llvm/test/CodeGen/AMDGPU/fshl.ll              |   42 +-
 llvm/test/CodeGen/AMDGPU/fshr.ll              |   14 +-
 llvm/test/CodeGen/AMDGPU/fsub.f16.ll          |  152 +-
 .../CodeGen/AMDGPU/global-i16-load-store.ll   |   10 +-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    |   88 +-
 .../AMDGPU/global_atomics_i32_system.ll       |   96 +-
 .../test/CodeGen/AMDGPU/global_atomics_i64.ll |  566 +-
 .../AMDGPU/global_atomics_i64_system.ll       |  202 +-
 .../identical-subrange-spill-infloop.ll       |   30 +-
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll         |    8 +-
 llvm/test/CodeGen/AMDGPU/idot4s.ll            |  117 +-
 llvm/test/CodeGen/AMDGPU/idot4u.ll            |  122 +-
 .../AMDGPU/indirect-addressing-term.ll        |    2 +-
 .../insert_waitcnt_for_precise_memory.ll      |  161 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll   |   38 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll   |   22 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll    |  256 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll    |  164 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll     |   11 +
 .../AMDGPU/llvm.amdgcn.global.atomic.csub.ll  |    8 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll    |  176 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll    |  116 +-
 .../AMDGPU/llvm.amdgcn.intersect_ray.ll       |   46 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll    |  516 +-
 .../llvm.amdgcn.raw.atomic.buffer.load.ll     |  313 -
 .../llvm.amdgcn.raw.ptr.atomic.buffer.load.ll |  313 -
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |    4 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll |    4 +-
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll |  212 +-
 llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll      |   24 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |   48 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |   48 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |   16 +-
 llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll  |  148 +-
 llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll   |    8 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |   16 +-
 llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll   |  168 +-
 llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll   |  168 +-
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll         |  224 +-
 llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll     |   74 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |  504 +-
 llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll      |   24 +-
 .../lower-buffer-fat-pointers-memops.ll       |    6 +-
 llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll      |    2 +-
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |   74 +-
 llvm/test/CodeGen/AMDGPU/mad.u16.ll           |   20 +-
 llvm/test/CodeGen/AMDGPU/madak.ll             |  104 +-
 llvm/test/CodeGen/AMDGPU/memory_clause.ll     |   18 +-
 llvm/test/CodeGen/AMDGPU/merge-s-load.mir     |  302 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |   64 +-
 .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll  |  160 +-
 llvm/test/CodeGen/AMDGPU/mul.ll               |  448 +-
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |   40 +-
 llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll |   76 +-
 llvm/test/CodeGen/AMDGPU/optimize-compare.ll  |    8 +-
 llvm/test/CodeGen/AMDGPU/packed-op-sel.ll     |  136 +-
 .../AMDGPU/post-ra-soft-clause-dbg-info.ll    |    8 +-
 .../test/CodeGen/AMDGPU/promote-vect3-load.ll |    4 +-
 .../AMDGPU/ptr-buffer-alias-scheduling.ll     |  100 +-
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          |   64 +-
 .../AMDGPU/required-export-priority.ll        |  344 -
 .../AMDGPU/required-export-priority.mir       |  294 -
 .../AMDGPU/resource-optimization-remarks.ll   |   21 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |    8 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |    6 +-
 llvm/test/CodeGen/AMDGPU/saddo.ll             |   82 +-
 llvm/test/CodeGen/AMDGPU/sdiv.ll              |  132 +-
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     |   66 +-
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |   90 +-
 .../CodeGen/AMDGPU/shrink-add-sub-constant.ll |  508 +-
 llvm/test/CodeGen/AMDGPU/smrd.ll              |    4 +-
 llvm/test/CodeGen/AMDGPU/srem.ll              |  416 +-
 llvm/test/CodeGen/AMDGPU/sub.ll               |   54 +-
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |  138 +-
 llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll |    6 +-
 llvm/test/CodeGen/AMDGPU/uaddo.ll             |   70 +-
 llvm/test/CodeGen/AMDGPU/usubo.ll             |   70 +-
 .../CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll   |    8 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |   10 +-
 llvm/test/CodeGen/AMDGPU/v_pack.ll            |   82 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |   34 +-
 .../CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll   |    8 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll |   10 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |   78 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |  411 +-
 llvm/test/CodeGen/AMDGPU/zero_extend.ll       |    2 +-
 llvm/test/CodeGen/ARM/neon_vabd.ll            |   16 +-
 llvm/test/CodeGen/LoongArch/andn-icmp.ll      |   57 +-
 llvm/test/CodeGen/LoongArch/cpus.ll           |    5 -
 .../lasx/ir-instruction/shuffle-as-xvilv.ll   |   74 -
 .../lasx/ir-instruction/shuffle-as-xvpack.ll  |  124 -
 .../lasx/ir-instruction/shuffle-as-xvpick.ll  |   84 -
 .../ir-instruction/shuffle-as-xvrepl128vei.ll |   65 -
 .../lasx/ir-instruction/shuffle-as-xvshuf.ll  |   79 -
 .../ir-instruction/shuffle-as-xvshuf4i.ll     |   43 -
 .../lsx/ir-instruction/shuffle-as-vilv.ll     |   82 -
 .../lsx/ir-instruction/shuffle-as-vpack.ll    |  122 -
 .../lsx/ir-instruction/shuffle-as-vpick.ll    |   82 -
 .../lsx/ir-instruction/shuffle-as-vreplvei.ll |   62 -
 .../lsx/ir-instruction/shuffle-as-vshuf.ll    |   84 -
 .../lsx/ir-instruction/shuffle-as-vshuf4i.ll  |   42 -
 .../LoongArch/psabi-restricted-scheduling.ll  |  152 +-
 llvm/test/CodeGen/NVPTX/i128.ll               |  572 +-
 .../PowerPC/patchable-function-entry.ll       |   58 -
 .../RISCV/atomic-cmpxchg-branch-on-result.ll  |    6 +-
 llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll     |   12 +-
 llvm/test/CodeGen/RISCV/atomic-rmw.ll         |   12 +-
 llvm/test/CodeGen/RISCV/atomic-signext.ll     |    4 +-
 llvm/test/CodeGen/RISCV/attributes.ll         |    4 +-
 llvm/test/CodeGen/RISCV/double-convert.ll     |    3 +-
 .../CodeGen/RISCV/double-round-conv-sat.ll    |  174 +-
 llvm/test/CodeGen/RISCV/pr84653_pr85190.ll    |    3 +-
 llvm/test/CodeGen/RISCV/pr97304.ll            |   51 -
 llvm/test/CodeGen/RISCV/rvv/pr99782.ll        |   18 -
 llvm/test/CodeGen/SystemZ/pr60413.ll          |  104 +-
 llvm/test/CodeGen/VE/Scalar/max.ll            |    2 -
 llvm/test/CodeGen/VE/Scalar/min.ll            |    2 -
 llvm/test/CodeGen/X86/abdu.ll                 |    2 +
 llvm/test/CodeGen/X86/combine-pmadd.ll        |  103 +-
 .../X86/div-rem-pair-recomposition-signed.ll  |  294 +-
 llvm/test/CodeGen/X86/pmul.ll                 |    2 +-
 llvm/test/CodeGen/X86/pr64589.ll              |    4 +-
 .../test/CodeGen/X86/vector-compare-all_of.ll |   34 +-
 .../test/CodeGen/X86/vector-compare-any_of.ll |   20 +-
 .../vector-interleaved-store-i16-stride-6.ll  | 3998 +++++-----
 .../vector-interleaved-store-i16-stride-7.ll  | 7086 +++++++++--------
 .../vector-interleaved-store-i8-stride-5.ll   |  516 +-
 .../vector-interleaved-store-i8-stride-6.ll   | 1890 ++---
 .../vector-interleaved-store-i8-stride-7.ll   | 2388 +++---
 .../vector-interleaved-store-i8-stride-8.ll   | 1088 +--
 llvm/test/CodeGen/X86/widen_bitcnt.ll         |  192 +-
 .../Symbolize/ELF/aarch64-mapping-symbol.s    |    6 +-
 .../Symbolize/ELF/csky-mapping-symbol.s       |    8 +-
 llvm/test/DebugInfo/XCOFF/empty.ll            |   20 +-
 llvm/test/DebugInfo/XCOFF/explicit-section.ll |   12 +-
 .../test/DebugInfo/XCOFF/function-sections.ll |   12 +-
 llvm/test/DebugInfo/omit-empty.ll             |    2 +-
 .../AddressSanitizer/skip-coro.ll             |   34 -
 llvm/test/MC/AArch64/CheckDataSymbol.s        |    2 +-
 .../test/MC/AArch64/mapping-across-sections.s |   20 +-
 llvm/test/MC/AArch64/mapping-within-section.s |   15 +-
 llvm/test/MC/AArch64/size-directive.s         |    2 +-
 llvm/test/MC/ELF/AArch64/cfi.s                |    4 +-
 llvm/test/MC/RISCV/mapping-across-sections.s  |   10 +-
 llvm/test/MC/RISCV/rv32zacas-invalid.s        |    2 +-
 llvm/test/MC/RISCV/rv32zacas-valid.s          |   12 +-
 llvm/test/MC/RISCV/rv64zacas-invalid.s        |    2 +-
 llvm/test/MC/RISCV/rv64zacas-valid.s          |    6 +-
 llvm/test/MC/RISCV/rvzabha-zacas-valid.s      |   12 +-
 llvm/test/Object/ARM/nm-mapping-symbol.s      |   12 +
 llvm/test/Other/new-pm-print-pipeline.ll      |    4 +-
 llvm/test/TableGen/riscv-target-def.td        |   18 +-
 .../memoryssa-scan-limit.ll                   |   45 -
 llvm/test/Transforms/GVN/condprop.ll          |    6 +-
 .../InstCombine/select-binop-cmp.ll           |   13 -
 llvm/test/Transforms/InstCombine/select.ll    |   16 -
 .../AArch64/vscale-fixups.ll                  |   47 -
 .../LoopVectorize/LoongArch/defaults.ll       |    2 +-
 .../RISCV/masked_gather_scatter.ll            |   84 +-
 .../RISCV/riscv-vector-reverse.ll             |  206 +-
 .../LoopVectorize/load-deref-pred-align.ll    |  128 +-
 .../PGOProfile/Inputs/cspgo_bar_sample.ll     |   82 -
 .../PGOProfile/counter_promo_sampling.ll      |   78 -
 .../Transforms/PGOProfile/cspgo_sample.ll     |  112 -
 .../instrprof_burst_sampling_fast.ll          |   47 -
 .../instrprof_burst_sampling_full.ll          |   45 -
 .../instrprof_burst_sampling_full_intsize.ll  |   45 -
 .../PGOProfile/instrprof_simple_sampling.ll   |   60 -
 .../SLPVectorizer/SystemZ/cmp-ptr-minmax.ll   |   39 -
 .../gather-extractelements-different-bbs.ll   |    2 +-
 .../X86/two-entry-phi-fold-unpredictable.ll   |  100 -
 .../RISCV/vpintrin-scalarization.ll           |  100 +-
 llvm/test/tools/llvm-dlltool/arm64ec.test     |   29 -
 .../AArch64/elf-aarch64-mapping-symbols.test  |    8 +-
 .../tools/llvm-profgen/period-scaling.test    |   84 -
 llvm/tools/llvm-profgen/PerfReader.cpp        |   14 -
 .../ExecutionEngine/Orc/CMakeLists.txt        |    1 -
 .../ExecutionEngine/Orc/MemoryFlagsTest.cpp   |   55 -
 llvm/unittests/SandboxIR/SandboxIRTest.cpp    |  121 +-
 .../Support/raw_socket_stream_test.cpp        |   45 +-
 .../TargetParser/RISCVISAInfoTest.cpp         |    2 +-
 llvm/unittests/TargetParser/TripleTest.cpp    |    6 -
 llvm/utils/TableGen/RISCVTargetDefEmitter.cpp |   37 -
 .../clang/tools/clang-nvlink-wrapper/BUILD.gn |   31 -
 .../gn/secondary/clang/tools/driver/BUILD.gn  |    1 -
 .../clang/unittests/Tooling/BUILD.gn          |    1 -
 .../compiler-rt/lib/builtins/BUILD.gn         |    1 -
 .../gn/secondary/libcxx/include/BUILD.gn      |    2 -
 .../secondary/llvm/lib/CodeGenData/BUILD.gn   |    3 -
 llvm/utils/gn/secondary/llvm/test/BUILD.gn    |    1 -
 .../secondary/llvm/tools/llvm-cgdata/BUILD.gn |   12 -
 .../unittests/ExecutionEngine/Orc/BUILD.gn    |    1 -
 llvm/utils/gn/secondary/llvm/version.gni      |    2 +-
 llvm/utils/lit/lit/__init__.py                |    2 +-
 .../tests/Inputs/shtest-env/env-calls-env.txt |    6 +-
 mlir/docs/Canonicalization.md                 |   39 +-
 mlir/include/mlir-c/BuiltinTypes.h            |   10 -
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td   |   24 +-
 .../mlir/Dialect/Utils/IndexingUtils.h        |    8 -
 mlir/include/mlir/IR/Builders.h               |    1 -
 mlir/include/mlir/IR/BuiltinTypes.h           |   18 +-
 mlir/include/mlir/IR/BuiltinTypes.td          |   35 +-
 mlir/include/mlir/IR/CommonTypeConstraints.td |   28 +-
 mlir/include/mlir/IR/Types.h                  |    1 -
 mlir/lib/AsmParser/TokenKinds.def             |    1 -
 mlir/lib/AsmParser/TypeParser.cpp             |    4 -
 mlir/lib/Bindings/Python/IRTypes.cpp          |   23 +-
 mlir/lib/CAPI/IR/BuiltinTypes.cpp             |   12 -
 .../Conversion/LLVMCommon/TypeConverter.cpp   |    5 +-
 .../VectorToSPIRV/VectorToSPIRV.cpp           |    4 +-
 .../Transforms/EmulateUnsupportedFloats.cpp   |    1 -
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    |  157 +-
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      |   61 +-
 mlir/lib/Dialect/Utils/IndexingUtils.cpp      |   26 -
 mlir/lib/IR/AffineExpr.cpp                    |    3 +-
 mlir/lib/IR/AsmPrinter.cpp                    |    1 -
 mlir/lib/IR/Builders.cpp                      |    4 -
 mlir/lib/IR/BuiltinTypes.cpp                  |    7 +-
 mlir/lib/IR/MLIRContext.cpp                   |    5 -
 mlir/lib/IR/Types.cpp                         |    1 -
 .../LLVMIR/LLVMToLLVMIRTranslation.cpp        |   19 -
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |   56 +-
 mlir/lib/Target/LLVMIR/ModuleImport.cpp       |   22 -
 .../Transforms/Utils/DialectConversion.cpp    |   11 +-
 mlir/python/mlir/_mlir_libs/_mlir/ir.pyi      |   14 -
 mlir/python/mlir/extras/types.py              |    2 -
 .../VectorToSPIRV/vector-to-spirv.mlir        |   13 -
 mlir/test/Dialect/LLVMIR/invalid.mlir         |  165 +-
 mlir/test/Dialect/LLVMIR/roundtrip.mlir       |   18 -
 mlir/test/Dialect/Linalg/canonicalize.mlir    |   75 +-
 .../Vector/vector-transfer-flatten.mlir       |  147 +-
 mlir/test/IR/attribute.mlir                   |    4 -
 .../test/Target/LLVMIR/Import/instructions.ll |   58 -
 mlir/test/Target/LLVMIR/llvmir.mlir           |   75 -
 mlir/test/Target/LLVMIR/omptarget-depend.mlir |  140 -
 mlir/test/python/ir/builtin_types.py          |    9 -
 mlir/unittests/IR/AffineExprTest.cpp          |    6 -
 mlir/unittests/IR/ShapedTypeTest.cpp          |   58 -
 mlir/utils/lldb-scripts/mlirDataFormatters.py |    1 -
 mlir/utils/tree-sitter-mlir/grammar.js        |    3 +-
 offload/DeviceRTL/src/Parallelism.cpp         |    7 +-
 .../test/offloading/fortran/target-depend.f90 |   69 -
 openmp/docs/ReleaseNotes.rst                  |    2 +-
 pstl/docs/ReleaseNotes.rst                    |    2 +-
 .../llvm-project-overlay/libc/BUILD.bazel     |   19 -
 870 files changed, 25434 insertions(+), 39810 deletions(-)
 delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.taint.TaintedAlloc.rst
 delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.PutenvStackArray.rst
 delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.BlockInCriticalSection.rst
 delete mode 100644 clang/docs/ClangNVLinkWrapper.rst
 delete mode 100644 clang/test/Analysis/issue-94193.cpp
 create mode 100644 clang/test/CXX/cpp/cpp.module/p2.cppm
 delete mode 100644 clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp
 delete mode 100644 clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
 delete mode 100644 clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp
 delete mode 100644 clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-gnu/.keep
 delete mode 100644 clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-pauthtest/.keep
 delete mode 100644 clang/test/Driver/aarch64-multilib-pauthabi.c
 delete mode 100644 clang/test/Driver/nvlink-wrapper.c
 delete mode 100644 clang/test/Preprocessor/pragma_mc_func.c
 delete mode 100644 clang/test/Sema/aarch64-neon-without-target-feature.cpp
 delete mode 100644 clang/test/Sema/ptrauth-indirect-goto.c
 delete mode 100644 clang/test/SemaCXX/invalid-template-declaration.cpp
 delete mode 100644 clang/test/SemaTemplate/alias-template-deprecated.cpp
 delete mode 100644 clang/tools/clang-nvlink-wrapper/CMakeLists.txt
 delete mode 100644 clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
 delete mode 100644 clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
 delete mode 100644 compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
 delete mode 100644 flang/test/Lower/HLFIR/calls-poly-to-nonpoly.f90
 delete mode 100644 flang/test/Lower/OpenMP/Todo/atomic-character.f90
 delete mode 100644 flang/test/Lower/OpenMP/Todo/atomic-complex.f90
 delete mode 100644 flang/test/Lower/OpenMP/lastprivate-allocatable.f90
 delete mode 100644 flang/test/Transforms/debug-92391.fir
 delete mode 100644 libc/src/math/dsqrtf128.h
 delete mode 100644 libc/src/math/dsqrtl.h
 delete mode 100644 libc/src/math/generic/dsqrtf128.cpp
 delete mode 100644 libc/src/math/generic/dsqrtl.cpp
 create mode 100644 libc/src/stdlib/exit_handler.cpp
 delete mode 100644 libc/test/src/math/dsqrtl_test.cpp
 delete mode 100644 libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp
 delete mode 100644 libc/test/src/math/smoke/dsqrtf128_test.cpp
 delete mode 100644 libc/test/src/math/smoke/dsqrtl_test.cpp
 delete mode 100644 libcxx/test/libcxx/containers/sequences/vector/assert.iterator.index.pass.cpp
 rename libcxx/test/libcxx/containers/sequences/vector/{assert.iterator.add.pass.cpp => debug.iterator.add.pass.cpp} (64%)
 rename libcxx/test/libcxx/containers/sequences/vector/{assert.iterator.decrement.pass.cpp => debug.iterator.decrement.pass.cpp} (70%)
 rename libcxx/test/libcxx/containers/sequences/vector/{assert.iterator.dereference.pass.cpp => debug.iterator.dereference.pass.cpp} (64%)
 rename libcxx/test/libcxx/containers/sequences/vector/{assert.iterator.increment.pass.cpp => debug.iterator.increment.pass.cpp} (63%)
 create mode 100644 libcxx/test/libcxx/containers/sequences/vector/debug.iterator.index.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/sequences/vector/fill_to_capacity.h
 rename libcxx/test/libcxx/strings/basic.string/string.iterators/{assert.iterator.add.pass.cpp => debug.iterator.add.pass.cpp} (77%)
 rename libcxx/test/libcxx/strings/basic.string/string.iterators/{assert.iterator.decrement.pass.cpp => debug.iterator.decrement.pass.cpp} (76%)
 rename libcxx/test/libcxx/strings/basic.string/string.iterators/{assert.iterator.dereference.pass.cpp => debug.iterator.dereference.pass.cpp} (75%)
 rename libcxx/test/libcxx/strings/basic.string/string.iterators/{assert.iterator.increment.pass.cpp => debug.iterator.increment.pass.cpp} (76%)
 rename libcxx/test/libcxx/strings/basic.string/string.iterators/{assert.iterator.index.pass.cpp => debug.iterator.index.pass.cpp} (66%)
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/assert.push_back.invalidation.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp
 delete mode 100644 llvm/benchmarks/xxhash.cpp
 delete mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
 delete mode 100644 llvm/test/CodeGen/AArch64/hardened-br-jump-table.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/ptrauth-fpac.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/ptrauth-indirectbr.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/required-export-priority.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/required-export-priority.mir
 delete mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
 delete mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/patchable-function-entry.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/pr97304.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/rvv/pr99782.ll
 delete mode 100644 llvm/test/Instrumentation/AddressSanitizer/skip-coro.ll
 create mode 100644 llvm/test/Object/ARM/nm-mapping-symbol.s
 delete mode 100644 llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll
 delete mode 100644 llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
 delete mode 100644 llvm/test/Transforms/PGOProfile/cspgo_sample.ll
 delete mode 100644 llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
 delete mode 100644 llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
 delete mode 100644 llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
 delete mode 100644 llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
 delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
 delete mode 100644 llvm/test/Transforms/SimplifyCFG/X86/two-entry-phi-fold-unpredictable.ll
 delete mode 100644 llvm/test/tools/llvm-profgen/period-scaling.test
 delete mode 100644 llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp
 delete mode 100644 llvm/utils/gn/secondary/clang/tools/clang-nvlink-wrapper/BUILD.gn
 delete mode 100644 llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn
 delete mode 100644 mlir/test/Target/LLVMIR/omptarget-depend.mlir
 delete mode 100644 offload/test/offloading/fortran/target-depend.f90

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 02303c5e6cc57..0da482457ad70 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -86,7 +86,6 @@ clang/test/AST/Interp/ @tbaederr
 /mlir/**/*VectorToLLVM* @banach-space @dcaballe @nicolasvasilache
 /mlir/**/*X86Vector* @aartbik @dcaballe @nicolasvasilache
 /mlir/include/mlir/Dialect/Vector @banach-space @dcaballe @nicolasvasilache
-/mlir/include/mlir/Dialect/Vector/IR @kuhar
 /mlir/lib/Dialect/Vector @banach-space @dcaballe @nicolasvasilache
 /mlir/lib/Dialect/Vector/Transforms/* @banach-space @dcaballe @hanhanW @nicolasvasilache
 /mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @banach-space @dcaballe @MaheshRavishankar @nicolasvasilache
diff --git a/.github/workflows/version-check.yml b/.github/workflows/version-check.yml
index 894e07d323ca9..4ce6119a407f5 100644
--- a/.github/workflows/version-check.yml
+++ b/.github/workflows/version-check.yml
@@ -27,5 +27,5 @@ jobs:
 
       - name: Version Check
         run: |
-          version=$(grep -o 'LLVM_VERSION_\(MAJOR\|MINOR\|PATCH\) [0-9]\+' cmake/Modules/LLVMVersion.cmake  | cut -d ' ' -f 2 | tr "\n" "." | sed 's/.$//g')
+          version=$(grep -o 'LLVM_VERSION_\(MAJOR\|MINOR\|PATCH\) [0-9]\+' llvm/CMakeLists.txt  | cut -d ' ' -f 2 | tr "\n" "." | sed 's/.$//g')
           .github/workflows/version-check.py "$version"
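
For clarity, the pipeline in the hunk above collects the three LLVM_VERSION_{MAJOR,MINOR,PATCH} values and joins them with dots. A minimal Python sketch of the same extraction, using an illustrative CMake fragment rather than the real contents of llvm/CMakeLists.txt:

    import re

    def extract_llvm_version(cmake_text):
        # Mirrors the grep/cut/tr/sed pipeline above: pull the numeric value of
        # each LLVM_VERSION_{MAJOR,MINOR,PATCH} setting and join them with dots.
        values = re.findall(r"LLVM_VERSION_(?:MAJOR|MINOR|PATCH) (\d+)", cmake_text)
        return ".".join(values)

    # Illustrative fragment; the real values live in llvm/CMakeLists.txt.
    sample = "set(LLVM_VERSION_MAJOR 20)\nset(LLVM_VERSION_MINOR 0)\nset(LLVM_VERSION_PATCH 0)"
    print(extract_llvm_version(sample))  # -> 20.0.0
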
diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index 0c8935457366d..ea51b77d838a1 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -88,7 +88,7 @@
 
 - `--comp-dir-override=<string>`
 
-  Overrides DW_AT_comp_dir, and provides an alternative base location, which is
+  Overrides DW_AT_comp_dir, and provides an alternative base location, which is
   used with DW_AT_dwo_name to construct a path to *.dwo files.
 
 - `--create-debug-names-section`
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 6a1106f23e485..83a5484f097ef 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -71,7 +71,7 @@ PrintMemData("print-mem-data",
 
 cl::opt<std::string> CompDirOverride(
     "comp-dir-override",
-    cl::desc("overrides DW_AT_comp_dir, and provides an alternative base "
+    cl::desc("overrides DW_AT_comp_dir, and provides an alterantive base "
              "location, which is used with DW_AT_dwo_name to construct a path "
              "to *.dwo files."),
     cl::Hidden, cl::init(""), cl::cat(BoltCategory));
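
Both hunks above concern --comp-dir-override, which supplies a replacement base directory that is combined with DW_AT_dwo_name to locate *.dwo files. A rough Python sketch of that lookup, with hypothetical attribute values and none of BOLT's actual types:

    import os.path

    def dwo_path(comp_dir_override, dw_at_comp_dir, dw_at_dwo_name):
        # Prefer the overridden base location; DW_AT_dwo_name may already be
        # absolute, in which case the base directory is irrelevant.
        base = comp_dir_override or dw_at_comp_dir
        if os.path.isabs(dw_at_dwo_name):
            return dw_at_dwo_name
        return os.path.join(base, dw_at_dwo_name)

    print(dwo_path("/moved/build", "/original/build", "objs/foo.dwo"))
    # -> /moved/build/objs/foo.dwo
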
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 1ec216b39e95c..ccb45f40c5c7a 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -620,9 +620,10 @@ void DWARFRewriter::updateDebugInfo() {
   uint32_t CUIndex = 0;
   std::mutex AccessMutex;
   // Needs to be invoked in the same order as CUs are processed.
-  auto createRangeLocListAddressWriters =
-      [&](DWARFUnit &CU) -> DebugLocWriter * {
+  llvm::DenseMap<uint64_t, uint64_t> LocListWritersIndexByCU;
+  auto createRangeLocListAddressWriters = [&](DWARFUnit &CU) {
     std::lock_guard<std::mutex> Lock(AccessMutex);
+
     const uint16_t DwarfVersion = CU.getVersion();
     if (DwarfVersion >= 5) {
       auto AddrW = std::make_unique<DebugAddrWriterDwarf5>(
@@ -641,7 +642,6 @@ void DWARFRewriter::updateDebugInfo() {
         RangeListsWritersByCU[*DWOId] = std::move(DWORangeListsSectionWriter);
       }
       AddressWritersByCU[CU.getOffset()] = std::move(AddrW);
-
     } else {
       auto AddrW =
           std::make_unique<DebugAddrWriter>(&BC, CU.getAddressByteSize());
@@ -657,7 +657,7 @@ void DWARFRewriter::updateDebugInfo() {
             std::move(LegacyRangesSectionWriterByCU);
       }
     }
-    return LocListWritersByCU[CUIndex++].get();
+    LocListWritersIndexByCU[CU.getOffset()] = CUIndex++;
   };
 
   DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
@@ -666,74 +666,68 @@ void DWARFRewriter::updateDebugInfo() {
   DWPState State;
   if (opts::WriteDWP)
     initDWPState(State);
-  auto processUnitDIE = [&](DWARFUnit *Unit, DIEBuilder *DIEBlder) {
-    // Check if the unit is a skeleton and we need special updates for it and
-    // its matching split/DWO CU.
-    std::optional<DWARFUnit *> SplitCU;
+  auto processSplitCU = [&](DWARFUnit &Unit, DWARFUnit &SplitCU,
+                            DIEBuilder &DIEBlder,
+                            DebugRangesSectionWriter &TempRangesSectionWriter,
+                            DebugAddrWriter &AddressWriter) {
+    DIEBuilder DWODIEBuilder(BC, &(SplitCU).getContext(), DebugNamesTable,
+                             &Unit);
+    DWODIEBuilder.buildDWOUnit(SplitCU);
+    std::string DWOName = "";
+    std::optional<std::string> DwarfOutputPath =
+        opts::DwarfOutputPath.empty()
+            ? std::nullopt
+            : std::optional<std::string>(opts::DwarfOutputPath.c_str());
+    {
+      std::lock_guard<std::mutex> Lock(AccessMutex);
+      DWOName = DIEBlder.updateDWONameCompDir(
+          *StrOffstsWriter, *StrWriter, Unit, DwarfOutputPath, std::nullopt);
+    }
+    DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
+    DebugStrWriter DWOStrWriter((SplitCU).getContext(), true);
+    DWODIEBuilder.updateDWONameCompDirForTypes(
+        DWOStrOffstsWriter, DWOStrWriter, SplitCU, DwarfOutputPath, DWOName);
+    DebugLoclistWriter DebugLocDWoWriter(Unit, Unit.getVersion(), true,
+                                         AddressWriter);
+
+    updateUnitDebugInfo(SplitCU, DWODIEBuilder, DebugLocDWoWriter,
+                        TempRangesSectionWriter, AddressWriter);
+    DebugLocDWoWriter.finalize(DWODIEBuilder,
+                               *DWODIEBuilder.getUnitDIEbyUnit(SplitCU));
+    if (Unit.getVersion() >= 5)
+      TempRangesSectionWriter.finalizeSection();
+
+    emitDWOBuilder(DWOName, DWODIEBuilder, *this, SplitCU, Unit, State,
+                   DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
+                   GDBIndexSection);
+  };
+  auto processMainBinaryCU = [&](DWARFUnit &Unit, DIEBuilder &DIEBlder) {
+    DebugAddrWriter &AddressWriter =
+        *AddressWritersByCU[Unit.getOffset()].get();
+    DebugRangesSectionWriter &RangesSectionWriter =
+        Unit.getVersion() >= 5 ? *RangeListsSectionWriter.get()
+                               : *LegacyRangesSectionWriter.get();
+    DebugLocWriter &DebugLocWriter =
+        *LocListWritersByCU[LocListWritersIndexByCU[Unit.getOffset()]].get();
     std::optional<uint64_t> RangesBase;
-    std::optional<uint64_t> DWOId = Unit->getDWOId();
+    std::optional<DWARFUnit *> SplitCU;
+    std::optional<uint64_t> DWOId = Unit.getDWOId();
     if (DWOId)
       SplitCU = BC.getDWOCU(*DWOId);
-    DebugLocWriter *DebugLocWriter = createRangeLocListAddressWriters(*Unit);
-    DebugRangesSectionWriter *RangesSectionWriter =
-        Unit->getVersion() >= 5 ? RangeListsSectionWriter.get()
-                                : LegacyRangesSectionWriter.get();
-    DebugAddrWriter *AddressWriter =
-        AddressWritersByCU[Unit->getOffset()].get();
-    // Skipping CUs that failed to load.
-    if (SplitCU) {
-      DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable,
-                               Unit);
-      DWODIEBuilder.buildDWOUnit(**SplitCU);
-      std::string DWOName = "";
-      std::optional<std::string> DwarfOutputPath =
-          opts::DwarfOutputPath.empty()
-              ? std::nullopt
-              : std::optional<std::string>(opts::DwarfOutputPath.c_str());
-      {
-        std::lock_guard<std::mutex> Lock(AccessMutex);
-        DWOName = DIEBlder->updateDWONameCompDir(
-            *StrOffstsWriter, *StrWriter, *Unit, DwarfOutputPath, std::nullopt);
-      }
-      DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
-      DebugStrWriter DWOStrWriter((*SplitCU)->getContext(), true);
-      DWODIEBuilder.updateDWONameCompDirForTypes(DWOStrOffstsWriter,
-                                                 DWOStrWriter, **SplitCU,
-                                                 DwarfOutputPath, DWOName);
-      DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true,
-                                           *AddressWriter);
-      DebugRangesSectionWriter *TempRangesSectionWriter = RangesSectionWriter;
-      if (Unit->getVersion() >= 5) {
-        TempRangesSectionWriter = RangeListsWritersByCU[*DWOId].get();
-      } else {
-        TempRangesSectionWriter = LegacyRangesWritersByCU[*DWOId].get();
-        RangesBase = RangesSectionWriter->getSectionOffset();
-      }
-
-      updateUnitDebugInfo(*(*SplitCU), DWODIEBuilder, DebugLocDWoWriter,
-                          *TempRangesSectionWriter, *AddressWriter);
-      DebugLocDWoWriter.finalize(DWODIEBuilder,
-                                 *DWODIEBuilder.getUnitDIEbyUnit(**SplitCU));
-      if (Unit->getVersion() >= 5)
-        TempRangesSectionWriter->finalizeSection();
-
-      emitDWOBuilder(DWOName, DWODIEBuilder, *this, **SplitCU, *Unit, State,
-                     DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
-                     GDBIndexSection);
-    }
-
-    if (Unit->getVersion() >= 5) {
-      RangesBase = RangesSectionWriter->getSectionOffset() +
+    if (Unit.getVersion() >= 5) {
+      RangesBase = RangesSectionWriter.getSectionOffset() +
                    getDWARF5RngListLocListHeaderSize();
-      RangesSectionWriter->initSection(*Unit);
-      StrOffstsWriter->finalizeSection(*Unit, *DIEBlder);
+      RangesSectionWriter.initSection(Unit);
+      StrOffstsWriter->finalizeSection(Unit, DIEBlder);
+    } else if (SplitCU) {
+      RangesBase = LegacyRangesSectionWriter.get()->getSectionOffset();
     }
 
-    updateUnitDebugInfo(*Unit, *DIEBlder, *DebugLocWriter, *RangesSectionWriter,
-                        *AddressWriter, RangesBase);
-    DebugLocWriter->finalize(*DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit));
-    if (Unit->getVersion() >= 5)
-      RangesSectionWriter->finalizeSection();
+    updateUnitDebugInfo(Unit, DIEBlder, DebugLocWriter, RangesSectionWriter,
+                        AddressWriter, RangesBase);
+    DebugLocWriter.finalize(DIEBlder, *DIEBlder.getUnitDIEbyUnit(Unit));
+    if (Unit.getVersion() >= 5)
+      RangesSectionWriter.finalizeSection();
   };
 
   DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable);
@@ -751,8 +745,24 @@ void DWARFRewriter::updateDebugInfo() {
   CUPartitionVector PartVec = partitionCUs(*BC.DwCtx);
   for (std::vector<DWARFUnit *> &Vec : PartVec) {
     DIEBlder.buildCompileUnits(Vec);
+    for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) {
+      createRangeLocListAddressWriters(*CU);
+      std::optional<DWARFUnit *> SplitCU;
+      std::optional<uint64_t> DWOId = CU->getDWOId();
+      if (DWOId)
+        SplitCU = BC.getDWOCU(*DWOId);
+      if (!SplitCU)
+        continue;
+      DebugAddrWriter &AddressWriter =
+          *AddressWritersByCU[CU->getOffset()].get();
+      DebugRangesSectionWriter *TempRangesSectionWriter =
+          CU->getVersion() >= 5 ? RangeListsWritersByCU[*DWOId].get()
+                                : LegacyRangesWritersByCU[*DWOId].get();
+      processSplitCU(*CU, **SplitCU, DIEBlder, *TempRangesSectionWriter,
+                     AddressWriter);
+    }
     for (DWARFUnit *CU : DIEBlder.getProcessedCUs())
-      processUnitDIE(CU, &DIEBlder);
+      processMainBinaryCU(*CU, DIEBlder);
     finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap,
                          DIEBlder.getProcessedCUs(), *FinalAddrWriter);
   }
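
In outline, the restructured updateDebugInfo() now makes two passes over each CU partition: the first creates the per-CU writers and processes any split (DWO) unit, the second updates the main-binary units, recovering each unit's location-list writer through the new offset-keyed index instead of a returned pointer. A simplified Python sketch of that control flow; every name here is a placeholder, not the rewriter's API:

    def update_debug_info(partitions, get_split_cu):
        loclist_writers = []   # one writer per CU, in processing order
        index_by_offset = {}   # CU offset -> slot in loclist_writers

        def create_writers(cu):
            loclist_writers.append("writer-for-%#x" % cu["offset"])
            index_by_offset[cu["offset"]] = len(loclist_writers) - 1

        for part in partitions:
            # Pass 1: create writers, then handle any split (DWO) unit.
            for cu in part:
                create_writers(cu)
                if get_split_cu(cu) is not None:
                    pass  # processSplitCU(cu, split_cu, ...)
            # Pass 2: update the main-binary unit, finding its writer by offset.
            for cu in part:
                writer = loclist_writers[index_by_offset[cu["offset"]]]
                # processMainBinaryCU(cu, writer, ...)

    update_debug_info([[{"offset": 0x0}, {"offset": 0x40}]], lambda cu: None)
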
diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py
index 3a62df1f510ba..3b14d5d158d2d 100755
--- a/clang-tools-extra/clang-tidy/add_new_check.py
+++ b/clang-tools-extra/clang-tidy/add_new_check.py
@@ -552,8 +552,8 @@ def format_link_alias(doc_file):
                 f.write('   :header: "Name", "Offers fixes"\n\n')
                 f.writelines(checks)
                 # and the aliases
-                f.write("\nCheck aliases\n-------------\n\n")
-                f.write(".. csv-table::\n")
+                f.write("\n\n")
+                f.write(".. csv-table:: Aliases..\n")
                 f.write('   :header: "Name", "Redirect", "Offers fixes"\n\n')
                 f.writelines(checks_alias)
                 break
diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
index 8b5be9cd95f76..ffb62b409b29b 100644
--- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
@@ -49,183 +49,183 @@ namespace {
 // with NULL argument and in this case the check is not applicable:
 // `mblen, mbrlen, mbrtowc, mbtowc, wctomb, wctomb_s`.
 // FIXME: The check can be improved to handle such cases.
-const llvm::StringRef CertErr33CCheckedFunctions = "^::aligned_alloc;"
-                                                   "^::asctime_s;"
-                                                   "^::at_quick_exit;"
-                                                   "^::atexit;"
-                                                   "^::bsearch;"
-                                                   "^::bsearch_s;"
-                                                   "^::btowc;"
-                                                   "^::c16rtomb;"
-                                                   "^::c32rtomb;"
-                                                   "^::calloc;"
-                                                   "^::clock;"
-                                                   "^::cnd_broadcast;"
-                                                   "^::cnd_init;"
-                                                   "^::cnd_signal;"
-                                                   "^::cnd_timedwait;"
-                                                   "^::cnd_wait;"
-                                                   "^::ctime_s;"
-                                                   "^::fclose;"
-                                                   "^::fflush;"
-                                                   "^::fgetc;"
-                                                   "^::fgetpos;"
-                                                   "^::fgets;"
-                                                   "^::fgetwc;"
-                                                   "^::fopen;"
-                                                   "^::fopen_s;"
-                                                   "^::fprintf;"
-                                                   "^::fprintf_s;"
-                                                   "^::fputc;"
-                                                   "^::fputs;"
-                                                   "^::fputwc;"
-                                                   "^::fputws;"
-                                                   "^::fread;"
-                                                   "^::freopen;"
-                                                   "^::freopen_s;"
-                                                   "^::fscanf;"
-                                                   "^::fscanf_s;"
-                                                   "^::fseek;"
-                                                   "^::fsetpos;"
-                                                   "^::ftell;"
-                                                   "^::fwprintf;"
-                                                   "^::fwprintf_s;"
-                                                   "^::fwrite;"
-                                                   "^::fwscanf;"
-                                                   "^::fwscanf_s;"
-                                                   "^::getc;"
-                                                   "^::getchar;"
-                                                   "^::getenv;"
-                                                   "^::getenv_s;"
-                                                   "^::gets_s;"
-                                                   "^::getwc;"
-                                                   "^::getwchar;"
-                                                   "^::gmtime;"
-                                                   "^::gmtime_s;"
-                                                   "^::localtime;"
-                                                   "^::localtime_s;"
-                                                   "^::malloc;"
-                                                   "^::mbrtoc16;"
-                                                   "^::mbrtoc32;"
-                                                   "^::mbsrtowcs;"
-                                                   "^::mbsrtowcs_s;"
-                                                   "^::mbstowcs;"
-                                                   "^::mbstowcs_s;"
-                                                   "^::memchr;"
-                                                   "^::mktime;"
-                                                   "^::mtx_init;"
-                                                   "^::mtx_lock;"
-                                                   "^::mtx_timedlock;"
-                                                   "^::mtx_trylock;"
-                                                   "^::mtx_unlock;"
-                                                   "^::printf_s;"
-                                                   "^::putc;"
-                                                   "^::putwc;"
-                                                   "^::raise;"
-                                                   "^::realloc;"
-                                                   "^::remove;"
-                                                   "^::rename;"
-                                                   "^::scanf;"
-                                                   "^::scanf_s;"
-                                                   "^::setlocale;"
-                                                   "^::setvbuf;"
-                                                   "^::signal;"
-                                                   "^::snprintf;"
-                                                   "^::snprintf_s;"
-                                                   "^::sprintf;"
-                                                   "^::sprintf_s;"
-                                                   "^::sscanf;"
-                                                   "^::sscanf_s;"
-                                                   "^::strchr;"
-                                                   "^::strerror_s;"
-                                                   "^::strftime;"
-                                                   "^::strpbrk;"
-                                                   "^::strrchr;"
-                                                   "^::strstr;"
-                                                   "^::strtod;"
-                                                   "^::strtof;"
-                                                   "^::strtoimax;"
-                                                   "^::strtok;"
-                                                   "^::strtok_s;"
-                                                   "^::strtol;"
-                                                   "^::strtold;"
-                                                   "^::strtoll;"
-                                                   "^::strtoul;"
-                                                   "^::strtoull;"
-                                                   "^::strtoumax;"
-                                                   "^::strxfrm;"
-                                                   "^::swprintf;"
-                                                   "^::swprintf_s;"
-                                                   "^::swscanf;"
-                                                   "^::swscanf_s;"
-                                                   "^::thrd_create;"
-                                                   "^::thrd_detach;"
-                                                   "^::thrd_join;"
-                                                   "^::thrd_sleep;"
-                                                   "^::time;"
-                                                   "^::timespec_get;"
-                                                   "^::tmpfile;"
-                                                   "^::tmpfile_s;"
-                                                   "^::tmpnam;"
-                                                   "^::tmpnam_s;"
-                                                   "^::tss_create;"
-                                                   "^::tss_get;"
-                                                   "^::tss_set;"
-                                                   "^::ungetc;"
-                                                   "^::ungetwc;"
-                                                   "^::vfprintf;"
-                                                   "^::vfprintf_s;"
-                                                   "^::vfscanf;"
-                                                   "^::vfscanf_s;"
-                                                   "^::vfwprintf;"
-                                                   "^::vfwprintf_s;"
-                                                   "^::vfwscanf;"
-                                                   "^::vfwscanf_s;"
-                                                   "^::vprintf_s;"
-                                                   "^::vscanf;"
-                                                   "^::vscanf_s;"
-                                                   "^::vsnprintf;"
-                                                   "^::vsnprintf_s;"
-                                                   "^::vsprintf;"
-                                                   "^::vsprintf_s;"
-                                                   "^::vsscanf;"
-                                                   "^::vsscanf_s;"
-                                                   "^::vswprintf;"
-                                                   "^::vswprintf_s;"
-                                                   "^::vswscanf;"
-                                                   "^::vswscanf_s;"
-                                                   "^::vwprintf_s;"
-                                                   "^::vwscanf;"
-                                                   "^::vwscanf_s;"
-                                                   "^::wcrtomb;"
-                                                   "^::wcschr;"
-                                                   "^::wcsftime;"
-                                                   "^::wcspbrk;"
-                                                   "^::wcsrchr;"
-                                                   "^::wcsrtombs;"
-                                                   "^::wcsrtombs_s;"
-                                                   "^::wcsstr;"
-                                                   "^::wcstod;"
-                                                   "^::wcstof;"
-                                                   "^::wcstoimax;"
-                                                   "^::wcstok;"
-                                                   "^::wcstok_s;"
-                                                   "^::wcstol;"
-                                                   "^::wcstold;"
-                                                   "^::wcstoll;"
-                                                   "^::wcstombs;"
-                                                   "^::wcstombs_s;"
-                                                   "^::wcstoul;"
-                                                   "^::wcstoull;"
-                                                   "^::wcstoumax;"
-                                                   "^::wcsxfrm;"
-                                                   "^::wctob;"
-                                                   "^::wctrans;"
-                                                   "^::wctype;"
-                                                   "^::wmemchr;"
-                                                   "^::wprintf_s;"
-                                                   "^::wscanf;"
-                                                   "^::wscanf_s;";
+const llvm::StringRef CertErr33CCheckedFunctions = "::aligned_alloc;"
+                                                   "::asctime_s;"
+                                                   "::at_quick_exit;"
+                                                   "::atexit;"
+                                                   "::bsearch;"
+                                                   "::bsearch_s;"
+                                                   "::btowc;"
+                                                   "::c16rtomb;"
+                                                   "::c32rtomb;"
+                                                   "::calloc;"
+                                                   "::clock;"
+                                                   "::cnd_broadcast;"
+                                                   "::cnd_init;"
+                                                   "::cnd_signal;"
+                                                   "::cnd_timedwait;"
+                                                   "::cnd_wait;"
+                                                   "::ctime_s;"
+                                                   "::fclose;"
+                                                   "::fflush;"
+                                                   "::fgetc;"
+                                                   "::fgetpos;"
+                                                   "::fgets;"
+                                                   "::fgetwc;"
+                                                   "::fopen;"
+                                                   "::fopen_s;"
+                                                   "::fprintf;"
+                                                   "::fprintf_s;"
+                                                   "::fputc;"
+                                                   "::fputs;"
+                                                   "::fputwc;"
+                                                   "::fputws;"
+                                                   "::fread;"
+                                                   "::freopen;"
+                                                   "::freopen_s;"
+                                                   "::fscanf;"
+                                                   "::fscanf_s;"
+                                                   "::fseek;"
+                                                   "::fsetpos;"
+                                                   "::ftell;"
+                                                   "::fwprintf;"
+                                                   "::fwprintf_s;"
+                                                   "::fwrite;"
+                                                   "::fwscanf;"
+                                                   "::fwscanf_s;"
+                                                   "::getc;"
+                                                   "::getchar;"
+                                                   "::getenv;"
+                                                   "::getenv_s;"
+                                                   "::gets_s;"
+                                                   "::getwc;"
+                                                   "::getwchar;"
+                                                   "::gmtime;"
+                                                   "::gmtime_s;"
+                                                   "::localtime;"
+                                                   "::localtime_s;"
+                                                   "::malloc;"
+                                                   "::mbrtoc16;"
+                                                   "::mbrtoc32;"
+                                                   "::mbsrtowcs;"
+                                                   "::mbsrtowcs_s;"
+                                                   "::mbstowcs;"
+                                                   "::mbstowcs_s;"
+                                                   "::memchr;"
+                                                   "::mktime;"
+                                                   "::mtx_init;"
+                                                   "::mtx_lock;"
+                                                   "::mtx_timedlock;"
+                                                   "::mtx_trylock;"
+                                                   "::mtx_unlock;"
+                                                   "::printf_s;"
+                                                   "::putc;"
+                                                   "::putwc;"
+                                                   "::raise;"
+                                                   "::realloc;"
+                                                   "::remove;"
+                                                   "::rename;"
+                                                   "::scanf;"
+                                                   "::scanf_s;"
+                                                   "::setlocale;"
+                                                   "::setvbuf;"
+                                                   "::signal;"
+                                                   "::snprintf;"
+                                                   "::snprintf_s;"
+                                                   "::sprintf;"
+                                                   "::sprintf_s;"
+                                                   "::sscanf;"
+                                                   "::sscanf_s;"
+                                                   "::strchr;"
+                                                   "::strerror_s;"
+                                                   "::strftime;"
+                                                   "::strpbrk;"
+                                                   "::strrchr;"
+                                                   "::strstr;"
+                                                   "::strtod;"
+                                                   "::strtof;"
+                                                   "::strtoimax;"
+                                                   "::strtok;"
+                                                   "::strtok_s;"
+                                                   "::strtol;"
+                                                   "::strtold;"
+                                                   "::strtoll;"
+                                                   "::strtoul;"
+                                                   "::strtoull;"
+                                                   "::strtoumax;"
+                                                   "::strxfrm;"
+                                                   "::swprintf;"
+                                                   "::swprintf_s;"
+                                                   "::swscanf;"
+                                                   "::swscanf_s;"
+                                                   "::thrd_create;"
+                                                   "::thrd_detach;"
+                                                   "::thrd_join;"
+                                                   "::thrd_sleep;"
+                                                   "::time;"
+                                                   "::timespec_get;"
+                                                   "::tmpfile;"
+                                                   "::tmpfile_s;"
+                                                   "::tmpnam;"
+                                                   "::tmpnam_s;"
+                                                   "::tss_create;"
+                                                   "::tss_get;"
+                                                   "::tss_set;"
+                                                   "::ungetc;"
+                                                   "::ungetwc;"
+                                                   "::vfprintf;"
+                                                   "::vfprintf_s;"
+                                                   "::vfscanf;"
+                                                   "::vfscanf_s;"
+                                                   "::vfwprintf;"
+                                                   "::vfwprintf_s;"
+                                                   "::vfwscanf;"
+                                                   "::vfwscanf_s;"
+                                                   "::vprintf_s;"
+                                                   "::vscanf;"
+                                                   "::vscanf_s;"
+                                                   "::vsnprintf;"
+                                                   "::vsnprintf_s;"
+                                                   "::vsprintf;"
+                                                   "::vsprintf_s;"
+                                                   "::vsscanf;"
+                                                   "::vsscanf_s;"
+                                                   "::vswprintf;"
+                                                   "::vswprintf_s;"
+                                                   "::vswscanf;"
+                                                   "::vswscanf_s;"
+                                                   "::vwprintf_s;"
+                                                   "::vwscanf;"
+                                                   "::vwscanf_s;"
+                                                   "::wcrtomb;"
+                                                   "::wcschr;"
+                                                   "::wcsftime;"
+                                                   "::wcspbrk;"
+                                                   "::wcsrchr;"
+                                                   "::wcsrtombs;"
+                                                   "::wcsrtombs_s;"
+                                                   "::wcsstr;"
+                                                   "::wcstod;"
+                                                   "::wcstof;"
+                                                   "::wcstoimax;"
+                                                   "::wcstok;"
+                                                   "::wcstok_s;"
+                                                   "::wcstol;"
+                                                   "::wcstold;"
+                                                   "::wcstoll;"
+                                                   "::wcstombs;"
+                                                   "::wcstombs_s;"
+                                                   "::wcstoul;"
+                                                   "::wcstoull;"
+                                                   "::wcstoumax;"
+                                                   "::wcsxfrm;"
+                                                   "::wctob;"
+                                                   "::wctrans;"
+                                                   "::wctype;"
+                                                   "::wmemchr;"
+                                                   "::wprintf_s;"
+                                                   "::wscanf;"
+                                                   "::wscanf_s;";
 
 } // namespace
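
The hunk above removes the leading '^' anchor from every entry in the checked-functions list. In plain regex terms the anchor pins a match to the start of the candidate name, so an unanchored '::fopen' can also match inside a longer qualified name; a quick illustration with Python's re module (not clang-tidy's matcher machinery):

    import re

    name = "MyNS::fopen"  # hypothetical user symbol, not the C library function
    print(bool(re.search(r"^::fopen", name)))  # False: anchored to string start
    print(bool(re.search(r"::fopen", name)))   # True: matches anywhere
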
 
diff --git a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
index e20cf6fbcb55a..8b500de0c028c 100644
--- a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
@@ -93,12 +93,13 @@ void ConstCorrectnessCheck::registerMatchers(MatchFinder *Finder) {
   // shall be run.
   const auto FunctionScope =
       functionDecl(
-          hasBody(stmt(forEachDescendant(
-                           declStmt(containsAnyDeclaration(
-                                        LocalValDecl.bind("local-value")),
-                                    unless(has(decompositionDecl())))
-                               .bind("decl-stmt")))
-                      .bind("scope")))
+          hasBody(
+              compoundStmt(forEachDescendant(
+                               declStmt(containsAnyDeclaration(
+                                            LocalValDecl.bind("local-value")),
+                                        unless(has(decompositionDecl())))
+                                   .bind("decl-stmt")))
+                  .bind("scope")))
           .bind("function-decl");
 
   Finder->addMatcher(FunctionScope, this);
@@ -108,7 +109,7 @@ void ConstCorrectnessCheck::registerMatchers(MatchFinder *Finder) {
 enum class VariableCategory { Value, Reference, Pointer };
 
 void ConstCorrectnessCheck::check(const MatchFinder::MatchResult &Result) {
-  const auto *LocalScope = Result.Nodes.getNodeAs<Stmt>("scope");
+  const auto *LocalScope = Result.Nodes.getNodeAs<CompoundStmt>("scope");
   const auto *Variable = Result.Nodes.getNodeAs<VarDecl>("local-value");
   const auto *Function = Result.Nodes.getNodeAs<FunctionDecl>("function-decl");
 
@@ -197,7 +198,7 @@ void ConstCorrectnessCheck::check(const MatchFinder::MatchResult &Result) {
   }
 }
 
-void ConstCorrectnessCheck::registerScope(const Stmt *LocalScope,
+void ConstCorrectnessCheck::registerScope(const CompoundStmt *LocalScope,
                                           ASTContext *Context) {
   auto &Analyzer = ScopesCache[LocalScope];
   if (!Analyzer)
diff --git a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h
index bba060e555d00..08ffde524522a 100644
--- a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h
+++ b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h
@@ -32,10 +32,10 @@ class ConstCorrectnessCheck : public ClangTidyCheck {
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
 
 private:
-  void registerScope(const Stmt *LocalScope, ASTContext *Context);
+  void registerScope(const CompoundStmt *LocalScope, ASTContext *Context);
 
   using MutationAnalyzer = std::unique_ptr<ExprMutationAnalyzer>;
-  llvm::DenseMap<const Stmt *, MutationAnalyzer> ScopesCache;
+  llvm::DenseMap<const CompoundStmt *, MutationAnalyzer> ScopesCache;
   llvm::DenseSet<SourceLocation> TemplateDiagnosticsCache;
 
   const bool AnalyzeValues;
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
index 3a255c5c133f1..5a4c2363bd8af 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
@@ -119,72 +119,65 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) {
     }
   }
 
-  handleConstRefFix(*Function, *Param, *Result.Context);
-}
-
-void UnnecessaryValueParamCheck::registerPPCallbacks(
-    const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
-  Inserter.registerPreprocessor(PP);
-}
-
-void UnnecessaryValueParamCheck::storeOptions(
-    ClangTidyOptions::OptionMap &Opts) {
-  Options.store(Opts, "IncludeStyle", Inserter.getStyle());
-  Options.store(Opts, "AllowedTypes",
-                utils::options::serializeStringList(AllowedTypes));
-}
-
-void UnnecessaryValueParamCheck::onEndOfTranslationUnit() {
-  MutationAnalyzerCache.clear();
-}
-
-void UnnecessaryValueParamCheck::handleConstRefFix(const FunctionDecl &Function,
-                                                   const ParmVarDecl &Param,
-                                                   ASTContext &Context) {
-  const size_t Index =
-      llvm::find(Function.parameters(), &Param) - Function.parameters().begin();
-  const bool IsConstQualified =
-      Param.getType().getCanonicalType().isConstQualified();
+  const size_t Index = llvm::find(Function->parameters(), Param) -
+                       Function->parameters().begin();
 
   auto Diag =
-      diag(Param.getLocation(),
+      diag(Param->getLocation(),
            "the %select{|const qualified }0parameter %1 is copied for each "
            "invocation%select{ but only used as a const reference|}0; consider "
            "making it a %select{const |}0reference")
-      << IsConstQualified << paramNameOrIndex(Param.getName(), Index);
+      << IsConstQualified << paramNameOrIndex(Param->getName(), Index);
   // Do not propose fixes when:
   // 1. the ParmVarDecl is in a macro, since we cannot place them correctly
   // 2. the function is virtual as it might break overrides
   // 3. the function is referenced outside of a call expression within the
   //    compilation unit as the signature change could introduce build errors.
  // 4. the function is an explicit template specialization.
-  const auto *Method = llvm::dyn_cast<CXXMethodDecl>(&Function);
-  if (Param.getBeginLoc().isMacroID() || (Method && Method->isVirtual()) ||
-      isReferencedOutsideOfCallExpr(Function, Context) ||
-      Function.getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+  const auto *Method = llvm::dyn_cast<CXXMethodDecl>(Function);
+  if (Param->getBeginLoc().isMacroID() || (Method && Method->isVirtual()) ||
+      isReferencedOutsideOfCallExpr(*Function, *Result.Context) ||
+      Function->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
     return;
-  for (const auto *FunctionDecl = &Function; FunctionDecl != nullptr;
+  for (const auto *FunctionDecl = Function; FunctionDecl != nullptr;
        FunctionDecl = FunctionDecl->getPreviousDecl()) {
     const auto &CurrentParam = *FunctionDecl->getParamDecl(Index);
-    Diag << utils::fixit::changeVarDeclToReference(CurrentParam, Context);
+    Diag << utils::fixit::changeVarDeclToReference(CurrentParam,
+                                                   *Result.Context);
     // The parameter of each declaration needs to be checked individually as to
     // whether it is const or not as constness can differ between definition and
     // declaration.
     if (!CurrentParam.getType().getCanonicalType().isConstQualified()) {
       if (std::optional<FixItHint> Fix = utils::fixit::addQualifierToVarDecl(
-              CurrentParam, Context, DeclSpec::TQ::TQ_const))
+              CurrentParam, *Result.Context, DeclSpec::TQ::TQ_const))
         Diag << *Fix;
     }
   }
 }
 
-void UnnecessaryValueParamCheck::handleMoveFix(const ParmVarDecl &Param,
+void UnnecessaryValueParamCheck::registerPPCallbacks(
+    const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
+  Inserter.registerPreprocessor(PP);
+}
+
+void UnnecessaryValueParamCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "IncludeStyle", Inserter.getStyle());
+  Options.store(Opts, "AllowedTypes",
+                utils::options::serializeStringList(AllowedTypes));
+}
+
+void UnnecessaryValueParamCheck::onEndOfTranslationUnit() {
+  MutationAnalyzerCache.clear();
+}
+
+void UnnecessaryValueParamCheck::handleMoveFix(const ParmVarDecl &Var,
                                                const DeclRefExpr &CopyArgument,
-                                               ASTContext &Context) {
+                                               const ASTContext &Context) {
   auto Diag = diag(CopyArgument.getBeginLoc(),
                    "parameter %0 is passed by value and only copied once; "
                    "consider moving it to avoid unnecessary copies")
-              << &Param;
+              << &Var;
   // Do not propose fixes in macros since we cannot place them correctly.
   if (CopyArgument.getBeginLoc().isMacroID())
     return;
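
The %select{...}0 placeholders in the diagnostic above choose between alternative wordings based on the boolean argument (here IsConstQualified). A toy renderer showing the two variants; this is a hand-rolled simplification, not clang's diagnostic engine, and the %1 parameter name is omitted:

    import re

    def render(fmt, flag):
        # %select{a|b}0 picks the |-separated alternative indexed by argument 0.
        return re.sub(r"%select\{([^}]*)\}0",
                      lambda m: m.group(1).split("|")[flag], fmt)

    msg = ("the %select{|const qualified }0parameter is copied for each "
           "invocation%select{ but only used as a const reference|}0; consider "
           "making it a %select{const |}0reference")
    print(render(msg, 0))  # non-const parameter variant
    print(render(msg, 1))  # const-qualified parameter variant
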
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
index 8bfd814d16357..7250bffd20b2f 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
@@ -33,16 +33,10 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck {
   void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
   void onEndOfTranslationUnit() override;
 
-protected:
-  // Create diagnostics. These are virtual so that derived classes can change
-  // behaviour.
-  virtual void handleMoveFix(const ParmVarDecl &Param,
-                             const DeclRefExpr &CopyArgument,
-                             ASTContext &Context);
-  virtual void handleConstRefFix(const FunctionDecl &Function,
-                                 const ParmVarDecl &Param, ASTContext &Context);
-
 private:
+  void handleMoveFix(const ParmVarDecl &Var, const DeclRefExpr &CopyArgument,
+                     const ASTContext &Context);
+
   ExprMutationAnalyzer::Memoized MutationAnalyzerCache;
   utils::IncludeInserter Inserter;
   const std::vector<StringRef> AllowedTypes;
diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
index 48401ba5ea42a..0dc35ad587362 100755
--- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
+++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
@@ -34,32 +34,29 @@
 http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html
 """
 
+from __future__ import print_function
+
 import argparse
-import asyncio
-from dataclasses import dataclass
 import glob
 import json
 import multiprocessing
 import os
+import queue
 import re
 import shutil
 import subprocess
 import sys
 import tempfile
-import time
+import threading
 import traceback
-from types import ModuleType
-from typing import Any, Awaitable, Callable, List, Optional, Tuple, TypeVar
-
 
-yaml: Optional[ModuleType] = None
 try:
     import yaml
 except ImportError:
     yaml = None
 
 
-def strtobool(val: str) -> bool:
+def strtobool(val):
     """Convert a string representation of truth to a bool following LLVM's CLI argument parsing."""
 
     val = val.lower()
@@ -70,11 +67,11 @@ def strtobool(val: str) -> bool:
 
     # Return ArgumentTypeError so that argparse does not substitute its own error message
     raise argparse.ArgumentTypeError(
-        f"'{val}' is invalid value for boolean argument! Try 0 or 1."
+        "'{}' is invalid value for boolean argument! Try 0 or 1.".format(val)
     )
 
 
-def find_compilation_database(path: str) -> str:
+def find_compilation_database(path):
     """Adjusts the directory until a compilation database is found."""
     result = os.path.realpath("./")
     while not os.path.isfile(os.path.join(result, path)):
@@ -86,43 +83,49 @@ def find_compilation_database(path: str) -> str:
     return result
 
 
+def make_absolute(f, directory):
+    if os.path.isabs(f):
+        return f
+    return os.path.normpath(os.path.join(directory, f))
+
+
 def get_tidy_invocation(
-    f: str,
-    clang_tidy_binary: str,
-    checks: str,
-    tmpdir: Optional[str],
-    build_path: str,
-    header_filter: Optional[str],
-    allow_enabling_alpha_checkers: bool,
-    extra_arg: List[str],
-    extra_arg_before: List[str],
-    quiet: bool,
-    config_file_path: str,
-    config: str,
-    line_filter: Optional[str],
-    use_color: bool,
-    plugins: List[str],
-    warnings_as_errors: Optional[str],
-    exclude_header_filter: Optional[str],
-    allow_no_checks: bool,
-) -> List[str]:
+    f,
+    clang_tidy_binary,
+    checks,
+    tmpdir,
+    build_path,
+    header_filter,
+    allow_enabling_alpha_checkers,
+    extra_arg,
+    extra_arg_before,
+    quiet,
+    config_file_path,
+    config,
+    line_filter,
+    use_color,
+    plugins,
+    warnings_as_errors,
+    exclude_header_filter,
+    allow_no_checks,
+):
     """Gets a command line for clang-tidy."""
     start = [clang_tidy_binary]
     if allow_enabling_alpha_checkers:
         start.append("-allow-enabling-analyzer-alpha-checkers")
     if exclude_header_filter is not None:
-        start.append(f"--exclude-header-filter={exclude_header_filter}")
+        start.append("--exclude-header-filter=" + exclude_header_filter)
     if header_filter is not None:
-        start.append(f"-header-filter={header_filter}")
+        start.append("-header-filter=" + header_filter)
     if line_filter is not None:
-        start.append(f"-line-filter={line_filter}")
+        start.append("-line-filter=" + line_filter)
     if use_color is not None:
         if use_color:
             start.append("--use-color")
         else:
             start.append("--use-color=false")
     if checks:
-        start.append(f"-checks={checks}")
+        start.append("-checks=" + checks)
     if tmpdir is not None:
         start.append("-export-fixes")
         # Get a temporary file. We immediately close the handle so clang-tidy can
@@ -131,29 +134,28 @@ def get_tidy_invocation(
         os.close(handle)
         start.append(name)
     for arg in extra_arg:
-        start.append(f"-extra-arg={arg}")
+        start.append("-extra-arg=%s" % arg)
     for arg in extra_arg_before:
-        start.append(f"-extra-arg-before={arg}")
-    start.append(f"-p={build_path}")
+        start.append("-extra-arg-before=%s" % arg)
+    start.append("-p=" + build_path)
     if quiet:
         start.append("-quiet")
     if config_file_path:
-        start.append(f"--config-file={config_file_path}")
+        start.append("--config-file=" + config_file_path)
     elif config:
-        start.append(f"-config={config}")
+        start.append("-config=" + config)
     for plugin in plugins:
-        start.append(f"-load={plugin}")
+        start.append("-load=" + plugin)
     if warnings_as_errors:
-        start.append(f"--warnings-as-errors={warnings_as_errors}")
+        start.append("--warnings-as-errors=" + warnings_as_errors)
     if allow_no_checks:
         start.append("--allow-no-checks")
     start.append(f)
     return start
 
 
-def merge_replacement_files(tmpdir: str, mergefile: str) -> None:
+def merge_replacement_files(tmpdir, mergefile):
     """Merge all replacement files in a directory into a single file"""
-    assert yaml
     # The fixes suggested by clang-tidy >= 4.0.0 are given under
     # the top level key 'Diagnostics' in the output yaml files
     mergekey = "Diagnostics"
@@ -177,14 +179,16 @@ def merge_replacement_files(tmpdir: str, mergefile: str) -> None:
         open(mergefile, "w").close()
 
 
-def find_binary(arg: str, name: str, build_path: str) -> str:
+def find_binary(arg, name, build_path):
     """Get the path for a binary or exit"""
     if arg:
         if shutil.which(arg):
             return arg
         else:
             raise SystemExit(
-                f"error: passed binary '{arg}' was not found or is not executable"
+                "error: passed binary '{}' was not found or is not executable".format(
+                    arg
+                )
             )
 
     built_path = os.path.join(build_path, "bin", name)
@@ -192,102 +196,66 @@ def find_binary(arg: str, name: str, build_path: str) -> str:
     if binary:
         return binary
     else:
-        raise SystemExit(f"error: failed to find {name} in $PATH or at {built_path}")
+        raise SystemExit(
+            "error: failed to find {} in $PATH or at {}".format(name, built_path)
+        )
 
 
-def apply_fixes(
-    args: argparse.Namespace, clang_apply_replacements_binary: str, tmpdir: str
-) -> None:
+def apply_fixes(args, clang_apply_replacements_binary, tmpdir):
     """Calls clang-apply-fixes on a given directory."""
     invocation = [clang_apply_replacements_binary]
     invocation.append("-ignore-insert-conflict")
     if args.format:
         invocation.append("-format")
     if args.style:
-        invocation.append(f"-style={args.style}")
+        invocation.append("-style=" + args.style)
     invocation.append(tmpdir)
     subprocess.call(invocation)
 
 
-# FIXME Python 3.12: This can be simplified out with run_with_semaphore[T](...).
-T = TypeVar("T")
-
-
-async def run_with_semaphore(
-    semaphore: asyncio.Semaphore,
-    f: Callable[..., Awaitable[T]],
-    *args: Any,
-    **kwargs: Any,
-) -> T:
-    async with semaphore:
-        return await f(*args, **kwargs)
-
-
-@dataclass
-class ClangTidyResult:
-    filename: str
-    invocation: List[str]
-    returncode: int
-    stdout: str
-    stderr: str
-    elapsed: float
-
-
-async def run_tidy(
-    args: argparse.Namespace,
-    name: str,
-    clang_tidy_binary: str,
-    tmpdir: str,
-    build_path: str,
-) -> ClangTidyResult:
-    """
-    Runs clang-tidy on a single file and returns the result.
-    """
-    invocation = get_tidy_invocation(
-        name,
-        clang_tidy_binary,
-        args.checks,
-        tmpdir,
-        build_path,
-        args.header_filter,
-        args.allow_enabling_alpha_checkers,
-        args.extra_arg,
-        args.extra_arg_before,
-        args.quiet,
-        args.config_file,
-        args.config,
-        args.line_filter,
-        args.use_color,
-        args.plugins,
-        args.warnings_as_errors,
-        args.exclude_header_filter,
-        args.allow_no_checks,
-    )
-
-    try:
-        process = await asyncio.create_subprocess_exec(
-            *invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+def run_tidy(args, clang_tidy_binary, tmpdir, build_path, queue, lock, failed_files):
+    """Takes filenames out of queue and runs clang-tidy on them."""
+    while True:
+        name = queue.get()
+        invocation = get_tidy_invocation(
+            name,
+            clang_tidy_binary,
+            args.checks,
+            tmpdir,
+            build_path,
+            args.header_filter,
+            args.allow_enabling_alpha_checkers,
+            args.extra_arg,
+            args.extra_arg_before,
+            args.quiet,
+            args.config_file,
+            args.config,
+            args.line_filter,
+            args.use_color,
+            args.plugins,
+            args.warnings_as_errors,
+            args.exclude_header_filter,
+            args.allow_no_checks,
         )
-        start = time.time()
-        stdout, stderr = await process.communicate()
-        end = time.time()
-    except asyncio.CancelledError:
-        process.terminate()
-        await process.wait()
-        raise
-
-    assert process.returncode is not None
-    return ClangTidyResult(
-        name,
-        invocation,
-        process.returncode,
-        stdout.decode("UTF-8"),
-        stderr.decode("UTF-8"),
-        end - start,
-    )
 
-
-async def main() -> None:
+        proc = subprocess.Popen(
+            invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        output, err = proc.communicate()
+        if proc.returncode != 0:
+            if proc.returncode < 0:
+                msg = "%s: terminated by signal %d\n" % (name, -proc.returncode)
+                err += msg.encode("utf-8")
+            failed_files.append(name)
+        with lock:
+            sys.stdout.write(" ".join(invocation) + "\n" + output.decode("utf-8"))
+            if len(err) > 0:
+                sys.stdout.flush()
+                sys.stderr.write(err.decode("utf-8"))
+        queue.task_done()
+
+
+def main():
     parser = argparse.ArgumentParser(
         description="Runs clang-tidy over all files "
         "in a compilation database. Requires "
@@ -464,7 +432,7 @@ async def main() -> None:
         )
 
     combine_fixes = False
-    export_fixes_dir: Optional[str] = None
+    export_fixes_dir = None
     delete_fixes_dir = False
     if args.export_fixes is not None:
         # if a directory is given, create it if it does not exist
@@ -522,10 +490,10 @@ async def main() -> None:
         sys.exit(1)
 
     # Load the database and extract all files.
-    with open(os.path.join(build_path, db_path)) as f:
-        database = json.load(f)
-    files = {os.path.abspath(os.path.join(e["directory"], e["file"])) for e in database}
-    number_files_in_database = len(files)
+    database = json.load(open(os.path.join(build_path, db_path)))
+    files = set(
+        [make_absolute(entry["file"], entry["directory"]) for entry in database]
+    )
 
     # Filter source files from compilation database.
     if args.source_filter:
@@ -546,81 +514,70 @@ async def main() -> None:
 
     # Build up a big regexy filter from all command line arguments.
     file_name_re = re.compile("|".join(args.files))
-    files = {f for f in files if file_name_re.search(f)}
-
-    print(
-        "Running clang-tidy for",
-        len(files),
-        "files out of",
-        number_files_in_database,
-        "in compilation database ...",
-    )
-
-    returncode = 0
-    semaphore = asyncio.Semaphore(max_task)
-    tasks = [
-        asyncio.create_task(
-            run_with_semaphore(
-                semaphore,
-                run_tidy,
-                args,
-                f,
-                clang_tidy_binary,
-                export_fixes_dir,
-                build_path,
-            )
-        )
-        for f in files
-    ]
 
+    return_code = 0
     try:
-        for i, coro in enumerate(asyncio.as_completed(tasks)):
-            result = await coro
-            if result.returncode != 0:
-                returncode = 1
-                if result.returncode < 0:
-                    result.stderr += f"{result.filename}: terminated by signal {-result.returncode}\n"
-            progress = f"[{i + 1: >{len(f'{len(files)}')}}/{len(files)}]"
-            runtime = f"[{result.elapsed:.1f}s]"
-            print(f"{progress}{runtime} {' '.join(result.invocation)}")
-            if result.stdout:
-                print(result.stdout, end=("" if result.stderr else "\n"))
-            if result.stderr:
-                print(result.stderr)
-    except asyncio.CancelledError:
+        # Spin up a bunch of tidy-launching threads.
+        task_queue = queue.Queue(max_task)
+        # List of files with a non-zero return code.
+        failed_files = []
+        lock = threading.Lock()
+        for _ in range(max_task):
+            t = threading.Thread(
+                target=run_tidy,
+                args=(
+                    args,
+                    clang_tidy_binary,
+                    export_fixes_dir,
+                    build_path,
+                    task_queue,
+                    lock,
+                    failed_files,
+                ),
+            )
+            t.daemon = True
+            t.start()
+
+        # Fill the queue with files.
+        for name in files:
+            if file_name_re.search(name):
+                task_queue.put(name)
+
+        # Wait for all threads to be done.
+        task_queue.join()
+        if len(failed_files):
+            return_code = 1
+
+    except KeyboardInterrupt:
+        # This is a sad hack. Unfortunately subprocess goes
+        # bonkers with ctrl-c and we start forking merrily.
         print("\nCtrl-C detected, goodbye.")
-        for task in tasks:
-            task.cancel()
         if delete_fixes_dir:
-            assert export_fixes_dir
             shutil.rmtree(export_fixes_dir)
-        return
+        os.kill(0, 9)
 
     if combine_fixes:
-        print(f"Writing fixes to {args.export_fixes} ...")
+        print("Writing fixes to " + args.export_fixes + " ...")
         try:
-            assert export_fixes_dir
             merge_replacement_files(export_fixes_dir, args.export_fixes)
         except:
             print("Error exporting fixes.\n", file=sys.stderr)
             traceback.print_exc()
-            returncode = 1
+            return_code = 1
 
     if args.fix:
         print("Applying fixes ...")
         try:
-            assert export_fixes_dir
             apply_fixes(args, clang_apply_replacements_binary, export_fixes_dir)
         except:
             print("Error applying fixes.\n", file=sys.stderr)
             traceback.print_exc()
-            returncode = 1
+            return_code = 1
 
     if delete_fixes_dir:
-        assert export_fixes_dir
         shutil.rmtree(export_fixes_dir)
-    sys.exit(returncode)
+    sys.exit(return_code)
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    main()
diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
index 0cdc7d08abc99..fd5dadc9b01db 100644
--- a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
+++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
@@ -96,7 +96,7 @@ bool areStatementsIdentical(const Stmt *FirstStmt, const Stmt *SecondStmt,
   if (FirstStmt == SecondStmt)
     return true;
 
-  if (FirstStmt->getStmtClass() != SecondStmt->getStmtClass())
+  if (FirstStmt->getStmtClass() != SecondStmt->getStmtClass())
     return false;
 
   if (isa<Expr>(FirstStmt) && isa<Expr>(SecondStmt)) {
diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
index 9bfb7e2677533..6ae46e2b1262a 100644
--- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
+++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp
@@ -141,10 +141,7 @@ bool isStandardPointerConvertible(QualType From, QualType To) {
     if (RD->isCompleteDefinition() &&
         isBaseOf(From->getPointeeType().getTypePtr(),
                  To->getPointeeType().getTypePtr())) {
-      // If B is an inaccessible or ambiguous base class of D, a program
-      // that necessitates this conversion is ill-formed
-      return isUnambiguousPublicBaseClass(From->getPointeeType().getTypePtr(),
-                                          To->getPointeeType().getTypePtr());
+      return true;
     }
   }
 
@@ -378,7 +375,10 @@ bool ExceptionAnalyzer::ExceptionInfo::filterByCatch(
         isPointerOrPointerToMember(ExceptionCanTy->getTypePtr())) {
       // A standard pointer conversion not involving conversions to pointers to
       // private or protected or ambiguous classes ...
-      if (isStandardPointerConvertible(ExceptionCanTy, HandlerCanTy)) {
+      if (isStandardPointerConvertible(ExceptionCanTy, HandlerCanTy) &&
+          isUnambiguousPublicBaseClass(
+              ExceptionCanTy->getTypePtr()->getPointeeType().getTypePtr(),
+              HandlerCanTy->getTypePtr()->getPointeeType().getTypePtr())) {
         TypesToDelete.push_back(ExceptionTy);
       }
       // A function pointer conversion ...
diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp
index e34706172f0bf..dc5b7ec95db5f 100644
--- a/clang-tools-extra/clangd/IncludeCleaner.cpp
+++ b/clang-tools-extra/clangd/IncludeCleaner.cpp
@@ -401,26 +401,6 @@ computeIncludeCleanerFindings(ParsedAST &AST, bool AnalyzeAngledIncludes) {
             Ref.RT != include_cleaner::RefType::Explicit)
           return;
 
-        // Check if we have any headers with the same spelling, in edge cases
-        // like `#include_next "foo.h"`, the user can't ever include the
-        // physical foo.h, but can have a spelling that refers to it.
-        // We postpone this check because spelling a header for every usage is
-        // expensive.
-        std::string Spelling = include_cleaner::spellHeader(
-            {Providers.front(), AST.getPreprocessor().getHeaderSearchInfo(),
-             MainFile});
-        for (auto *Inc :
-             ConvertedIncludes.match(include_cleaner::Header{Spelling})) {
-          Satisfied = true;
-          auto HeaderID =
-              AST.getIncludeStructure().getID(&Inc->Resolved->getFileEntry());
-          assert(HeaderID.has_value() &&
-                 "ConvertedIncludes only contains resolved includes.");
-          Used.insert(*HeaderID);
-        }
-        if (Satisfied)
-          return;
-
         // We actually always want to map usages to their spellings, but
         // spelling locations can point into preamble section. Using these
         // offsets could lead into crashes in presence of stale preambles. Hence
diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
index 0ee748c1ed2d0..7027232460354 100644
--- a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
+++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp
@@ -644,28 +644,6 @@ TEST(IncludeCleaner, ResourceDirIsIgnored) {
   EXPECT_THAT(Findings.MissingIncludes, IsEmpty());
 }
 
-TEST(IncludeCleaner, DifferentHeaderSameSpelling) {
-  // `foo` is declared in foo_inner/foo.h, but there's no way to spell it
-  // directly. Make sure we don't generate unusued/missing include findings in
-  // such cases.
-  auto TU = TestTU::withCode(R"cpp(
-    #include <foo.h>
-    void baz() {
-      foo();
-    }
-  )cpp");
-  TU.AdditionalFiles["foo/foo.h"] = guard("#include_next <foo.h>");
-  TU.AdditionalFiles["foo_inner/foo.h"] = guard(R"cpp(
-    void foo();
-  )cpp");
-  TU.ExtraArgs.push_back("-Ifoo");
-  TU.ExtraArgs.push_back("-Ifoo_inner");
-
-  auto AST = TU.build();
-  auto Findings = computeIncludeCleanerFindings(AST);
-  EXPECT_THAT(Findings.UnusedIncludes, IsEmpty());
-  EXPECT_THAT(Findings.MissingIncludes, IsEmpty());
-}
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 642ad39cc0c1c..a23483e6df6d2 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -48,6 +48,10 @@ Major New Features
 Improvements to clangd
 ----------------------
 
+- Introduced experimental support for C++20 Modules. It can be enabled with the
+  `-experimental-modules-support` option. The support is in an early development
+  stage and may not perform efficiently in real-world scenarios.
+
 Inlay hints
 ^^^^^^^^^^^
 
@@ -69,6 +73,9 @@ Code completion
 Code actions
 ^^^^^^^^^^^^
 
+- The tweak for turning unscoped into scoped enums now removes redundant prefixes
+  from the enum values.
+
 Signature help
 ^^^^^^^^^^^^^^
 
@@ -81,12 +88,23 @@ Objective-C
 Miscellaneous
 ^^^^^^^^^^^^^
 
+- Added a boolean option `AnalyzeAngledIncludes` to the `Includes` config section,
+  which enables unused-include detection for all angled ("system") headers.
+  Umbrella headers are not yet supported, so enabling this option
+  may result in false positives.
+
 Improvements to clang-doc
 -------------------------
 
 Improvements to clang-query
 ---------------------------
 
+- Added the `file` command to dynamically load a list of commands and matchers
+  from an external file, allowing the cost of reading the compilation database
+  and building the AST to be imposed just once for faster prototyping.
+
+- Removed support for ``enable output srcloc``. Fixes #GH82591
+
 Improvements to clang-rename
 ----------------------------
 
@@ -95,21 +113,424 @@ The improvements are...
 Improvements to clang-tidy
 --------------------------
 
+- Improved :program:`run-clang-tidy.py` script. Added argument `-source-filter`
+  to filter source files from the compilation database via a RegEx, in a
+  similar fashion to what `-header-filter` does for header files.
+
+- Improved :program:`check_clang_tidy.py` script. Added argument `-export-fixes`
+  to aid in clang-tidy and test development.
+
+- Fixed a bug where large values for unsigned check options overflowed into
+  negative values when printed with `--dump-config`.
+
+- Fixed the `--verify-config` option not properly parsing checks when the
+  literal operator is used in the `.clang-tidy` config.
+
+- Added argument `--exclude-header-filter` and config option `ExcludeHeaderFilterRegex`
+  to exclude headers from analysis via a RegEx.
+
+- Added argument `--allow-no-checks` to suppress the "no checks enabled" error
+  when all checks are disabled via `--checks='-*'`.
+
 New checks
 ^^^^^^^^^^
 
+- New :doc:`boost-use-ranges
+  <clang-tidy/checks/boost/use-ranges>` check.
+
+  Detects calls to standard library iterator algorithms that could be replaced
+  with a Boost ranges version instead.
+
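+  An illustrative sketch of the transformation (variable names are
+  hypothetical, not taken from the check's documentation):
+
+  .. code-block:: c++
+
+     #include <algorithm>
+     #include <boost/range/algorithm/find.hpp>
+     #include <vector>
+
+     void f(std::vector<int> &v) {
+       auto it = std::find(v.begin(), v.end(), 42); // flagged
+       auto it2 = boost::range::find(v, 42);        // suggested replacement
+     }
+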
+- New :doc:`bugprone-crtp-constructor-accessibility
+  <clang-tidy/checks/bugprone/crtp-constructor-accessibility>` check.
+
+  Detects error-prone Curiously Recurring Template Pattern usage, when the CRTP
+  can be constructed outside itself and the derived class.
+
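+  A sketch of the pattern the check guards against (class names are
+  hypothetical):
+
+  .. code-block:: c++
+
+     template <typename T> class Crtp {
+     public:
+       Crtp() = default; // flagged: constructible outside T and Crtp<T>
+     };
+
+     class Good : public Crtp<Good> {};
+     class Bad : public Crtp<Good> {}; // compiles, but T is the wrong class
+
+  Declaring the constructor private and befriending ``T`` prevents the misuse.
+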
+- New :doc:`bugprone-pointer-arithmetic-on-polymorphic-object
+  <clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object>` check.
+
+  Finds pointer arithmetic performed on classes that contain a virtual function.
+
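+  An illustrative sketch (names are hypothetical):
+
+  .. code-block:: c++
+
+     struct Base { virtual ~Base() = default; };
+     struct Derived : Base { int Extra; };
+
+     void advance(Base *Ptr) {
+       ++Ptr; // flagged: if Ptr points into an array of Derived,
+              // the increment lands in the middle of an object
+     }
+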
+- New :doc:`bugprone-return-const-ref-from-parameter
+  <clang-tidy/checks/bugprone/return-const-ref-from-parameter>` check.
+
+  Detects return statements that return a constant reference parameter as constant
+  reference. This may cause use-after-free errors if the caller uses xvalues as
+  arguments.
+
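+  A sketch of the dangling-reference hazard (names are hypothetical):
+
+  .. code-block:: c++
+
+     #include <string>
+
+     const std::string &self(const std::string &S) { return S; } // flagged
+
+     // The temporary argument dies at the end of this statement, so R dangles:
+     const std::string &R = self(std::string("tmp"));
+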
+- New :doc:`bugprone-suspicious-stringview-data-usage
+  <clang-tidy/checks/bugprone/suspicious-stringview-data-usage>` check.
+
+  Identifies suspicious usages of ``std::string_view::data()`` that could lead
+  to reading out-of-bounds data due to inadequate or incorrect string null
+  termination.
+
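+  An illustrative sketch (names are hypothetical):
+
+  .. code-block:: c++
+
+     #include <string_view>
+
+     void takesCString(const char *Str);
+
+     void f(std::string_view Sv) {
+       takesCString(Sv.data()); // flagged: Sv need not be null-terminated
+     }
+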
+- New :doc:`misc-use-internal-linkage
+  <clang-tidy/checks/misc/use-internal-linkage>` check.
+
+  Detects variables and functions that can be marked as static or moved into
+  an anonymous namespace to enforce internal linkage.
+
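+  An illustrative sketch (names are hypothetical):
+
+  .. code-block:: c++
+
+     // foo.cpp -- helper() is referenced only in this translation unit
+     int helper() { return 1; }              // flagged
+     static int quietHelper() { return 1; }  // suggested style: internal linkage
+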
+- New :doc:`modernize-min-max-use-initializer-list
+  <clang-tidy/checks/modernize/min-max-use-initializer-list>` check.
+
+  Replaces nested ``std::min`` and ``std::max`` calls with an initializer list
+  where applicable.
+
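+  A sketch of the transformation (names are hypothetical):
+
+  .. code-block:: c++
+
+     #include <algorithm>
+
+     int largest(int A, int B, int C) {
+       int M = std::max(A, std::max(B, C)); // flagged
+       return std::max({A, B, C});          // suggested replacement
+     }
+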
+- New :doc:`modernize-use-designated-initializers
+  <clang-tidy/checks/modernize/use-designated-initializers>` check.
+
+  Finds initializer lists for aggregate types that could be
+  written as designated initializers instead.
+
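+  An illustrative sketch (names are hypothetical):
+
+  .. code-block:: c++
+
+     struct Point { int X; int Y; };
+
+     Point P{1, 2};           // flagged
+     Point Q{.X = 1, .Y = 2}; // suggested (C++20 designated initializers)
+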
+- New :doc:`modernize-use-ranges
+  <clang-tidy/checks/modernize/use-ranges>` check.
+
+  Detects calls to standard library iterator algorithms that could be replaced
+  with a ranges version instead.
+
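+  A sketch of the transformation (names are hypothetical):
+
+  .. code-block:: c++
+
+     #include <algorithm>
+     #include <vector>
+
+     void f(std::vector<int> &V) {
+       auto It = std::find(V.begin(), V.end(), 42); // flagged
+       auto It2 = std::ranges::find(V, 42);         // suggested replacement
+     }
+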
+- New :doc:`modernize-use-std-format
+  <clang-tidy/checks/modernize/use-std-format>` check.
+
+  Converts calls to ``absl::StrFormat``, or other functions via
+  configuration options, to C++20's ``std::format``, or another function
+  via a configuration option, modifying the format string appropriately and
+  removing now-unnecessary calls to ``std::string::c_str()`` and
+  ``std::string::data()``.
+
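+  An illustrative sketch, assuming ``absl::StrFormat`` and C++20's
+  ``std::format`` are both available (names are hypothetical):
+
+  .. code-block:: c++
+
+     std::string Before = absl::StrFormat("%s has %d users", Name.c_str(), N); // flagged
+     std::string After = std::format("{} has {} users", Name, N); // suggested;
+                                                                  // c_str() is dropped
+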
+- New :doc:`readability-enum-initial-value
+  <clang-tidy/checks/readability/enum-initial-value>` check.
+
+  Enforces consistent style for enumerators' initialization, covering three
+  styles: none, first only, or all initialized explicitly.
+
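+  An illustrative sketch (names are hypothetical):
+
+  .. code-block:: c++
+
+     enum Mixed { A = 1, B, C = 4 };          // flagged: inconsistent style
+     enum Consistent { X = 1, Y = 2, Z = 4 }; // one of the accepted styles
+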
+- New :doc:`readability-math-missing-parentheses
+  <clang-tidy/checks/readability/math-missing-parentheses>` check.
+
+  Checks for missing parentheses in mathematical expressions that involve
+  operators of different priorities.
+
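+  An illustrative sketch (names are hypothetical):
+
+  .. code-block:: c++
+
+     int f(int A, int B, int C) {
+       int R = A + B * C;  // flagged: mixed precedence without parentheses
+       return A + (B * C); // suggested
+     }
+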
+- New :doc:`readability-use-std-min-max
+  <clang-tidy/checks/readability/use-std-min-max>` check.
+
+  Replaces certain conditional statements with equivalent calls to
+  ``std::min`` or ``std::max``.
+
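+  A sketch of the transformation (names are hypothetical):
+
+  .. code-block:: c++
+
+     #include <algorithm>
+
+     void raiseTo(int &X, int Y) {
+       if (X < Y) // flagged
+         X = Y;
+       // suggested replacement: X = std::max(X, Y);
+     }
+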
 New check aliases
 ^^^^^^^^^^^^^^^^^
 
+- New alias :doc:`cert-ctr56-cpp <clang-tidy/checks/cert/ctr56-cpp>` to
+  :doc:`bugprone-pointer-arithmetic-on-polymorphic-object
+  <clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object>`
+  was added.
+
+- New alias :doc:`cert-int09-c <clang-tidy/checks/cert/int09-c>` to
+  :doc:`readability-enum-initial-value <clang-tidy/checks/readability/enum-initial-value>`
+  was added.
+
 Changes in existing checks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Improved :doc:`bugprone-assert-side-effect
+  <clang-tidy/checks/bugprone/assert-side-effect>` check by detecting side
+  effects from calling a method with non-const reference parameters.
+
+- Improved :doc:`bugprone-assignment-in-if-condition
+  <clang-tidy/checks/bugprone/assignment-in-if-condition>` check by ignoring
+  assignments in the C++20 ``requires`` clause.
+
+- Improved :doc:`bugprone-casting-through-void
+  <clang-tidy/checks/bugprone/casting-through-void>` check by ignoring casts
+  where the source is already a ``void`` pointer, making middle ``void`` pointer
+  casts bug-free.
+
+- Improved :doc:`bugprone-forwarding-reference-overload
+  <clang-tidy/checks/bugprone/forwarding-reference-overload>`
+  check to ignore deleted constructors which won't hide other overloads.
+
+- Improved :doc:`bugprone-implicit-widening-of-multiplication-result
+  <clang-tidy/checks/bugprone/implicit-widening-of-multiplication-result>` check
+  by adding an option to ignore constant expressions of signed integer types
+  that fit in the source expression type.
+
+- Improved :doc:`bugprone-inc-dec-in-conditions
+  <clang-tidy/checks/bugprone/inc-dec-in-conditions>` check to ignore code
+  within unevaluated contexts, such as ``decltype``.
+
+- Improved :doc:`bugprone-lambda-function-name<clang-tidy/checks/bugprone/lambda-function-name>`
+  check by ignoring ``__func__`` macro in lambda captures, initializers of
+  default parameters and nested function declarations.
+
+- Improved :doc:`bugprone-multi-level-implicit-pointer-conversion
+  <clang-tidy/checks/bugprone/multi-level-implicit-pointer-conversion>` check
+  by ignoring implicit pointer conversions that are part of a cast expression.
+
+- Improved :doc:`bugprone-non-zero-enum-to-bool-conversion
+  <clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion>` check by
+  eliminating false positives resulting from direct usage of bitwise operators
+  within parentheses.
+
+- Improved :doc:`bugprone-optional-value-conversion
+  <clang-tidy/checks/bugprone/optional-value-conversion>` check by eliminating
+  false positives resulting from use of optionals in unevaluated context.
+
+- Improved :doc:`bugprone-sizeof-expression
+  <clang-tidy/checks/bugprone/sizeof-expression>` check by clarifying the
+  diagnostics, eliminating some false positives and adding a new
+  (off-by-default) option `WarnOnSizeOfPointer` that reports all
+  ``sizeof(pointer)`` expressions (except for a few that are idiomatic).
+
+- Improved :doc:`bugprone-suspicious-include
+  <clang-tidy/checks/bugprone/suspicious-include>` check by replacing the local
+  options `HeaderFileExtensions` and `ImplementationFileExtensions` by the
+  global options of the same name.
+
+- Improved :doc:`bugprone-too-small-loop-variable
+  <clang-tidy/checks/bugprone/too-small-loop-variable>` check by incorporating
+  better support for ``const`` loop boundaries.
+
+- Improved :doc:`bugprone-unused-local-non-trivial-variable
+  <clang-tidy/checks/bugprone/unused-local-non-trivial-variable>` check by
+  ignoring local variables with the ``[[maybe_unused]]`` attribute.
+
+- Improved :doc:`bugprone-unused-return-value
+  <clang-tidy/checks/bugprone/unused-return-value>` check by updating the
+  parameter `CheckedFunctions` to support regular expressions, avoiding false
+  positives for functions with the same prefix as a checked function (e.g.
+  ``std::unique_ptr`` and ``std::unique``) and for assignment operator overloads.
+
+- Improved :doc:`bugprone-use-after-move
+  <clang-tidy/checks/bugprone/use-after-move>` check to also handle
+  calls to ``std::forward``. Fixed sequencing of designated initializers. Fixed
+  sequencing of callees: In C++17 and later, the callee of a function is guaranteed
+  to be sequenced before the arguments, so don't warn if the use happens in the
+  callee and the move happens in one of the arguments.
+
+- Improved :doc:`cppcoreguidelines-avoid-non-const-global-variables
+  <clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables>` check
+  with a new option `AllowInternalLinkage` to disable the warning for variables
+  with internal linkage.
+
+- Improved :doc:`cppcoreguidelines-macro-usage
+  <clang-tidy/checks/cppcoreguidelines/macro-usage>` check by ignoring macros
+  with hash preprocessing tokens.
+
+- Improved :doc:`cppcoreguidelines-missing-std-forward
+  <clang-tidy/checks/cppcoreguidelines/missing-std-forward>` check by no longer
+  giving false positives for deleted functions, by fixing false negatives when only
+  a few parameters are forwarded and by ignoring parameters without a name (unused
+  arguments).
+
+- Improved :doc:`cppcoreguidelines-owning-memory
+  <clang-tidy/checks/cppcoreguidelines/owning-memory>` check to properly handle
+  return type in lambdas and in nested functions.
+
+- Improved :doc:`cppcoreguidelines-prefer-member-initializer
+  <clang-tidy/checks/cppcoreguidelines/prefer-member-initializer>` check
+  by removing enforcement of rule `C.48
+  <https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#c48-prefer-in-class-initializers-to-member-initializers-in-constructors-for-constant-initializers>`_,
+  which was deprecated since :program:`clang-tidy` 17. This rule is now covered
+  by :doc:`cppcoreguidelines-use-default-member-init
+  <clang-tidy/checks/cppcoreguidelines/use-default-member-init>`. Fixed
+  incorrect hints when using list-initialization.
+
+- Improved :doc:`cppcoreguidelines-special-member-functions
+  <clang-tidy/checks/cppcoreguidelines/special-member-functions>` check with a
+  new option `AllowImplicitlyDeletedCopyOrMove`, which removes the requirement
+  for explicit copy or move special member functions when they are already
+  implicitly deleted.
+
+- Improved :doc:`google-build-namespaces
+  <clang-tidy/checks/google/build-namespaces>` check by replacing the local
+  option `HeaderFileExtensions` by the global option of the same name.
+
+- Improved :doc:`google-explicit-constructor
+  <clang-tidy/checks/google/explicit-constructor>` check to better handle
+  C++20 `explicit(bool)`.
+
+- Improved :doc:`google-global-names-in-headers
+  <clang-tidy/checks/google/global-names-in-headers>` check by replacing the local
+  option `HeaderFileExtensions` by the global option of the same name.
+
+- Improved :doc:`google-runtime-int <clang-tidy/checks/google/runtime-int>`
+  check performance through optimizations.
+
+- Improved :doc:`hicpp-signed-bitwise <clang-tidy/checks/hicpp/signed-bitwise>`
+  check by avoiding false positives involving positive integer literals behind
+  implicit casts when `IgnorePositiveIntegerLiterals` is enabled.
+
+- Improved :doc:`hicpp-ignored-remove-result <clang-tidy/checks/hicpp/ignored-remove-result>`
+  check by ignoring other functions with the same prefixes as the targeted
+  functions.
+
+- Improved :doc:`linuxkernel-must-check-errs
+  <clang-tidy/checks/linuxkernel/must-check-errs>` check documentation to
+  consistently use the check's proper name.
+
+- Improved :doc:`llvm-header-guard
+  <clang-tidy/checks/llvm/header-guard>` check by replacing the local
+  option `HeaderFileExtensions` by the global option of the same name.
+
+- Improved :doc:`misc-const-correctness
+  <clang-tidy/checks/misc/const-correctness>` check by avoiding infinite recursion
+  for recursive functions with forwarding reference parameters and reference
+  variables which refer to themselves.
+
+- Improved :doc:`misc-definitions-in-headers
+  <clang-tidy/checks/misc/definitions-in-headers>` check by replacing the local
+  option `HeaderFileExtensions` by the global option of the same name.
+  Additionally, the option `UseHeaderFileExtensions` is removed, so that the
+  check uses the `HeaderFileExtensions` option unconditionally.
+
+- Improved :doc:`misc-header-include-cycle
+  <clang-tidy/checks/misc/header-include-cycle>` check by avoiding a crash on
+  self-include cycles.
+
+- Improved :doc:`misc-unused-using-decls
+  <clang-tidy/checks/misc/unused-using-decls>` check by replacing the local
+  option `HeaderFileExtensions` by the global option of the same name.
+
+- Improved :doc:`misc-use-anonymous-namespace
+  <clang-tidy/checks/misc/use-anonymous-namespace>` check by replacing the local
+  option `HeaderFileExtensions` by the global option of the same name.
+
+- Improved :doc:`modernize-avoid-c-arrays
+  <clang-tidy/checks/modernize/avoid-c-arrays>` check by introducing the new
+  `AllowStringArrays` option, enabling the exclusion of array types with deduced
+  length initialized from string literals.
+
+- Improved :doc:`modernize-loop-convert
+  <clang-tidy/checks/modernize/loop-convert>` check by ensuring that fix-its
+  don't remove parentheses used in ``sizeof`` calls when they have array index
+  accesses as arguments.
+
+- Improved :doc:`modernize-use-constraints
+  <clang-tidy/checks/modernize/use-constraints>` check by fixing a crash that
+  occurred in some scenarios and excluding system headers from analysis.
+
+- Improved :doc:`modernize-use-nullptr
+  <clang-tidy/checks/modernize/use-nullptr>` check to include support for C23,
+  which also has introduced the ``nullptr`` keyword.
+
+- Improved :doc:`modernize-use-override
+  <clang-tidy/checks/modernize/use-override>` check to also remove any trailing
+  whitespace when deleting the ``virtual`` keyword.
+
+- Improved :doc:`modernize-use-starts-ends-with
+  <clang-tidy/checks/modernize/use-starts-ends-with>` check to also handle
+  calls to the ``compare`` method.
+
+- Improved :doc:`modernize-use-std-print
+  <clang-tidy/checks/modernize/use-std-print>` check to not crash if the
+  format string parameter of the function to be replaced is not of the
+  expected type.
+
+- Improved :doc:`modernize-use-using <clang-tidy/checks/modernize/use-using>`
+  check by adding support for detection of typedefs declared on function level.
+
+- Improved :doc:`performance-inefficient-vector-operation
+  <clang-tidy/checks/performance/inefficient-vector-operation>` check by fixing
+  false negatives caused by a mismatch between the variable's declared type and
+  the type of its initial value in the loop initialization expression.
+
+- Improved :doc:`performance-move-const-arg
+  <clang-tidy/checks/performance/move-const-arg>` check by ignoring
+  ``std::move()`` calls when their target is used as an rvalue.
+
+- Improved :doc:`performance-unnecessary-copy-initialization
+  <clang-tidy/checks/performance/unnecessary-copy-initialization>` check by
+  detecting more cases of constant access. In particular, pointers can be
+  analyzed, so the check now handles the common patterns
+  `const auto e = (*vector_ptr)[i];` and `const auto e = vector_ptr->at(i);`.
+  Calls to mutable functions where a `const` overload exists are also
+  handled. Fixed a crash in the case of a non-member operator call.
+
+- Improved :doc:`performance-unnecessary-value-param
+  <clang-tidy/checks/performance/unnecessary-value-param>` check by
+  detecting more cases for template functions, including lambdas with ``auto``.
+  E.g., ``std::sort(a.begin(), a.end(), [](auto x, auto y) { return x > y; });``
+  will be detected for expensive-to-copy types. Fixed false positives for
+  dependent call expressions.
+
+- Improved :doc:`readability-avoid-return-with-void-value
+  <clang-tidy/checks/readability/avoid-return-with-void-value>` check by adding
+  fix-its.
+
+- Improved :doc:`readability-const-return-type
+  <clang-tidy/checks/readability/const-return-type>` check to eliminate false
+  positives when returning types with const not at the top level.
+
+- Improved :doc:`readability-container-size-empty
+  <clang-tidy/checks/readability/container-size-empty>` check to prevent false
+  positives when utilizing ``size`` or ``length`` methods that accept a parameter.
+  Fixed a crash when encountering templated user-defined literals.
+
+- Improved :doc:`readability-duplicate-include
+  <clang-tidy/checks/readability/duplicate-include>` check by excluding include
+  directives that form the filename using a macro.
+
+- Improved :doc:`readability-else-after-return
+  <clang-tidy/checks/readability/else-after-return>` check to ignore
+  `if consteval` statements, for which the `else` branch must not be removed.
+
+- Improved :doc:`readability-identifier-naming
+  <clang-tidy/checks/readability/identifier-naming>` check in `GetConfigPerFile`
+  mode by resolving symbolic links to header files. Fixed handling of Hungarian
+  Prefix when configured to `LowerCase`. Added support for renaming designated
+  initializers. Added support for renaming macro arguments. Fixed renaming
+  conflicts arising from out-of-line member function template definitions.
+
+- Improved :doc:`readability-implicit-bool-conversion
+  <clang-tidy/checks/readability/implicit-bool-conversion>` check to provide
+  valid fix suggestions for ``static_cast`` without a preceding space and
+  fixed problem with duplicate parentheses in double implicit casts. Corrected
+  the fix suggestions for C23 and later by using C-style casts instead of
+  ``static_cast``. Fixed false positives in C++20 spaceship operator by ignoring
+  casts in implicit and defaulted functions.
+
+- Improved :doc:`readability-redundant-inline-specifier
+  <clang-tidy/checks/readability/redundant-inline-specifier>` check to properly
+  emit warnings for static data members with an in-class initializer.
+
+- Improved :doc:`readability-redundant-member-init
+  <clang-tidy/checks/readability/redundant-member-init>` check to avoid
+  false positives when the type of the member does not match the type of the
+  initializer.
+
+- Improved :doc:`readability-static-accessed-through-instance
+  <clang-tidy/checks/readability/static-accessed-through-instance>` check to
+  support calls to overloaded operators as base expression and provide fixes to
+  expressions with side-effects.
+
+- Improved :doc:`readability-simplify-boolean-expr
+  <clang-tidy/checks/readability/simplify-boolean-expr>` check to avoid emitting
+  warnings for macros when the `IgnoreMacros` option is enabled, and to improve
+  messages when the auto-fix does not work.
+
+- Improved :doc:`readability-static-definition-in-anonymous-namespace
+  <clang-tidy/checks/readability/static-definition-in-anonymous-namespace>`
+  check by resolving fix-it overlaps in template code by disregarding implicit
+  instantiations.
+
+- Improved :doc:`readability-string-compare
+  <clang-tidy/checks/readability/string-compare>` check to also detect
+  usages of ``std::string_view::compare``. Added a `StringLikeClasses` option
+  to detect usages of the ``compare`` method in custom string-like classes.
+
 Removed checks
 ^^^^^^^^^^^^^^
 
+- Removed `cert-dcl21-cpp`, which was deprecated since :program:`clang-tidy` 17,
+  since the rule DCL21-CPP has been removed from the CERT guidelines.
+
 Miscellaneous
 ^^^^^^^^^^^^^
 
+- Fixed incorrect formatting in :program:`clang-apply-replacements` when no
+  `--format` option is specified. Now :program:`clang-apply-replacements`
+  applies formatting only when the option is given.
+
 Improvements to include-fixer
 -----------------------------
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/cplusplus.Move.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/cplusplus.Move.rst
index f478598bb4e6c..e723f21f6bc60 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/cplusplus.Move.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/cplusplus.Move.rst
@@ -1,13 +1,9 @@
 .. title:: clang-tidy - clang-analyzer-cplusplus.Move
-.. meta::
-   :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-move
 
 clang-analyzer-cplusplus.Move
 =============================
 
 Find use-after-move bugs in C++.
 
-The `clang-analyzer-cplusplus.Move` check is an alias, please see
-`Clang Static Analyzer Available Checkers
-<https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-move>`_
-for more information.
+The clang-analyzer-cplusplus.Move check is an alias of
+Clang Static Analyzer cplusplus.Move.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.taint.TaintedAlloc.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.taint.TaintedAlloc.rst
deleted file mode 100644
index 9732333c61aa7..0000000000000
--- a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.taint.TaintedAlloc.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-.. title:: clang-tidy - clang-analyzer-optin.taint.TaintedAlloc
-.. meta::
-   :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#optin-taint-taintedalloc
-
-clang-analyzer-optin.taint.TaintedAlloc
-=======================================
-
-Check for memory allocations, where the size parameter might be a tainted
-(attacker controlled) value.
-
-The `clang-analyzer-optin.taint.TaintedAlloc` check is an alias, please see
-`Clang Static Analyzer Available Checkers
-<https://clang.llvm.org/docs/analyzer/checkers.html#optin-taint-taintedalloc>`_
-for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.PutenvStackArray.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.PutenvStackArray.rst
deleted file mode 100644
index 0a5feff8d3ca8..0000000000000
--- a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.PutenvStackArray.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-.. title:: clang-tidy - clang-analyzer-security.PutenvStackArray
-
-clang-analyzer-security.PutenvStackArray
-========================================
-
-Finds calls to the function 'putenv' which pass a pointer to an automatic
-(stack-allocated) array as the argument.
-
-The clang-analyzer-security.PutenvStackArray check is an alias of
-Clang Static Analyzer security.PutenvStackArray.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.BlockInCriticalSection.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.BlockInCriticalSection.rst
deleted file mode 100644
index 40d2d3d8b8aac..0000000000000
--- a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.BlockInCriticalSection.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-.. title:: clang-tidy - clang-analyzer-unix.BlockInCriticalSection
-.. meta::
-   :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#unix-blockincriticalsection
-
-clang-analyzer-unix.BlockInCriticalSection
-==========================================
-
-Check for calls to blocking functions inside a critical section.
-
-The `clang-analyzer-unix.BlockInCriticalSection` check is an alias, please see
-`Clang Static Analyzer Available Checkers
-<https://clang.llvm.org/docs/analyzer/checkers.html#unix-blockincriticalsection>`_
-for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index a931ebf025a10..dd2887edb0f8d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -118,7 +118,6 @@ Clang-Tidy Checks
    :doc:`bugprone-not-null-terminated-result <bugprone/not-null-terminated-result>`, "Yes"
    :doc:`bugprone-optional-value-conversion <bugprone/optional-value-conversion>`, "Yes"
    :doc:`bugprone-parent-virtual-call <bugprone/parent-virtual-call>`, "Yes"
-   :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`,
    :doc:`bugprone-posix-return <bugprone/posix-return>`, "Yes"
    :doc:`bugprone-redundant-branch-condition <bugprone/redundant-branch-condition>`, "Yes"
    :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes"
@@ -159,6 +158,7 @@ Clang-Tidy Checks
    :doc:`bugprone-unused-raii <bugprone/unused-raii>`, "Yes"
    :doc:`bugprone-unused-return-value <bugprone/unused-return-value>`,
    :doc:`bugprone-use-after-move <bugprone/use-after-move>`,
+   :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`,
    :doc:`bugprone-virtual-near-miss <bugprone/virtual-near-miss>`, "Yes"
    :doc:`cert-dcl50-cpp <cert/dcl50-cpp>`,
    :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`,
@@ -269,7 +269,7 @@ Clang-Tidy Checks
    :doc:`misc-unused-parameters <misc/unused-parameters>`, "Yes"
    :doc:`misc-unused-using-decls <misc/unused-using-decls>`, "Yes"
    :doc:`misc-use-anonymous-namespace <misc/use-anonymous-namespace>`,
-   :doc:`misc-use-internal-linkage <misc/use-internal-linkage>`, "Yes"
+   :doc:`misc-use-internal-linkage <misc/use-internal-linkage>`,
    :doc:`modernize-avoid-bind <modernize/avoid-bind>`, "Yes"
    :doc:`modernize-avoid-c-arrays <modernize/avoid-c-arrays>`,
    :doc:`modernize-concat-nested-namespaces <modernize/concat-nested-namespaces>`, "Yes"
@@ -409,7 +409,6 @@ Check aliases
    :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`, :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`,
    :doc:`cert-con36-c <cert/con36-c>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`,
    :doc:`cert-con54-cpp <cert/con54-cpp>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`,
-   :doc:`cert-ctr56-cpp <cert/ctr56-cpp>`, :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`,
    :doc:`cert-dcl03-c <cert/dcl03-c>`, :doc:`misc-static-assert <misc/static-assert>`, "Yes"
    :doc:`cert-dcl16-c <cert/dcl16-c>`, :doc:`readability-uppercase-literal-suffix <readability/uppercase-literal-suffix>`, "Yes"
    :doc:`cert-dcl37-c <cert/dcl37-c>`, :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes"
@@ -449,7 +448,7 @@ Check aliases
    :doc:`clang-analyzer-core.uninitialized.UndefReturn <clang-analyzer/core.uninitialized.UndefReturn>`, `Clang Static Analyzer core.uninitialized.UndefReturn <https://clang.llvm.org/docs/analyzer/checkers.html#core-uninitialized-undefreturn>`_,
    :doc:`clang-analyzer-cplusplus.ArrayDelete <clang-analyzer/cplusplus.ArrayDelete>`, `Clang Static Analyzer cplusplus.ArrayDelete <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-arraydelete>`_,
    :doc:`clang-analyzer-cplusplus.InnerPointer <clang-analyzer/cplusplus.InnerPointer>`, `Clang Static Analyzer cplusplus.InnerPointer <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-innerpointer>`_,
-   :doc:`clang-analyzer-cplusplus.Move <clang-analyzer/cplusplus.Move>`, `Clang Static Analyzer cplusplus.Move <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-move>`_,
+   :doc:`clang-analyzer-cplusplus.Move <clang-analyzer/cplusplus.Move>`, Clang Static Analyzer cplusplus.Move,
    :doc:`clang-analyzer-cplusplus.NewDelete <clang-analyzer/cplusplus.NewDelete>`, `Clang Static Analyzer cplusplus.NewDelete <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-newdelete>`_,
    :doc:`clang-analyzer-cplusplus.NewDeleteLeaks <clang-analyzer/cplusplus.NewDeleteLeaks>`, `Clang Static Analyzer cplusplus.NewDeleteLeaks <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-newdeleteleaks>`_,
    :doc:`clang-analyzer-cplusplus.PlacementNew <clang-analyzer/cplusplus.PlacementNew>`, `Clang Static Analyzer cplusplus.PlacementNew <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-placementnew>`_,
@@ -472,7 +471,6 @@ Check aliases
    :doc:`clang-analyzer-optin.performance.GCDAntipattern <clang-analyzer/optin.performance.GCDAntipattern>`, `Clang Static Analyzer optin.performance.GCDAntipattern <https://clang.llvm.org/docs/analyzer/checkers.html#optin-performance-gcdantipattern>`_,
    :doc:`clang-analyzer-optin.performance.Padding <clang-analyzer/optin.performance.Padding>`, `Clang Static Analyzer optin.performance.Padding <https://clang.llvm.org/docs/analyzer/checkers.html#optin-performance-padding>`_,
    :doc:`clang-analyzer-optin.portability.UnixAPI <clang-analyzer/optin.portability.UnixAPI>`, `Clang Static Analyzer optin.portability.UnixAPI <https://clang.llvm.org/docs/analyzer/checkers.html#optin-portability-unixapi>`_,
-   :doc:`clang-analyzer-optin.taint.TaintedAlloc <clang-analyzer/optin.taint.TaintedAlloc>`, `Clang Static Analyzer optin.taint.TaintedAlloc <https://clang.llvm.org/docs/analyzer/checkers.html#optin-taint-taintedalloc>`_,
    :doc:`clang-analyzer-osx.API <clang-analyzer/osx.API>`, `Clang Static Analyzer osx.API <https://clang.llvm.org/docs/analyzer/checkers.html#osx-api>`_,
    :doc:`clang-analyzer-osx.MIG <clang-analyzer/osx.MIG>`, Clang Static Analyzer osx.MIG,
    :doc:`clang-analyzer-osx.NumberObjectConversion <clang-analyzer/osx.NumberObjectConversion>`, `Clang Static Analyzer osx.NumberObjectConversion <https://clang.llvm.org/docs/analyzer/checkers.html#osx-numberobjectconversion>`_,
@@ -503,7 +501,6 @@ Check aliases
    :doc:`clang-analyzer-osx.coreFoundation.containers.OutOfBounds <clang-analyzer/osx.coreFoundation.containers.OutOfBounds>`, `Clang Static Analyzer osx.coreFoundation.containers.OutOfBounds <https://clang.llvm.org/docs/analyzer/checkers.html#osx-corefoundation-containers-outofbounds>`_,
    :doc:`clang-analyzer-osx.coreFoundation.containers.PointerSizedValues <clang-analyzer/osx.coreFoundation.containers.PointerSizedValues>`, `Clang Static Analyzer osx.coreFoundation.containers.PointerSizedValues <https://clang.llvm.org/docs/analyzer/checkers.html#osx-corefoundation-containers-pointersizedvalues>`_,
    :doc:`clang-analyzer-security.FloatLoopCounter <clang-analyzer/security.FloatLoopCounter>`, `Clang Static Analyzer security.FloatLoopCounter <https://clang.llvm.org/docs/analyzer/checkers.html#security-floatloopcounter>`_,
-   :doc:`clang-analyzer-security.PutenvStackArray <clang-analyzer/security.PutenvStackArray>`, Clang Static Analyzer security.PutenvStackArray,
    :doc:`clang-analyzer-security.SetgidSetuidOrder <clang-analyzer/security.SetgidSetuidOrder>`, Clang Static Analyzer security.SetgidSetuidOrder,
    :doc:`clang-analyzer-security.cert.env.InvalidPtr <clang-analyzer/security.cert.env.InvalidPtr>`, `Clang Static Analyzer security.cert.env.InvalidPtr <https://clang.llvm.org/docs/analyzer/checkers.html#security-cert-env-invalidptr>`_,
    :doc:`clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling <clang-analyzer/security.insecureAPI.DeprecatedOrUnsafeBufferHandling>`, `Clang Static Analyzer security.insecureAPI.DeprecatedOrUnsafeBufferHandling <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-deprecatedorunsafebufferhandling>`_,
@@ -520,7 +517,6 @@ Check aliases
    :doc:`clang-analyzer-security.insecureAPI.strcpy <clang-analyzer/security.insecureAPI.strcpy>`, `Clang Static Analyzer security.insecureAPI.strcpy <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-strcpy>`_,
    :doc:`clang-analyzer-security.insecureAPI.vfork <clang-analyzer/security.insecureAPI.vfork>`, `Clang Static Analyzer security.insecureAPI.vfork <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-vfork>`_,
    :doc:`clang-analyzer-unix.API <clang-analyzer/unix.API>`, `Clang Static Analyzer unix.API <https://clang.llvm.org/docs/analyzer/checkers.html#unix-api>`_,
-   :doc:`clang-analyzer-unix.BlockInCriticalSection <clang-analyzer/unix.BlockInCriticalSection>`, `Clang Static Analyzer unix.BlockInCriticalSection <https://clang.llvm.org/docs/analyzer/checkers.html#unix-blockincriticalsection>`_,
    :doc:`clang-analyzer-unix.Errno <clang-analyzer/unix.Errno>`, `Clang Static Analyzer unix.Errno <https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno>`_,
    :doc:`clang-analyzer-unix.Malloc <clang-analyzer/unix.Malloc>`, `Clang Static Analyzer unix.Malloc <https://clang.llvm.org/docs/analyzer/checkers.html#unix-malloc>`_,
    :doc:`clang-analyzer-unix.MallocSizeof <clang-analyzer/unix.MallocSizeof>`, `Clang Static Analyzer unix.MallocSizeof <https://clang.llvm.org/docs/analyzer/checkers.html#unix-mallocsizeof>`_,
diff --git a/clang-tools-extra/include-cleaner/lib/Analysis.cpp b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
index 68fe79d6929f6..f1cd72f877ca2 100644
--- a/clang-tools-extra/include-cleaner/lib/Analysis.cpp
+++ b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
@@ -105,20 +105,9 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
              }
              if (!Satisfied && !Providers.empty() &&
                  Ref.RT == RefType::Explicit &&
-                 !HeaderFilter(Providers.front().resolvedPath())) {
-               // Check if we have any headers with the same spelling, in edge
-               // cases like `#include_next "foo.h"`, the user can't ever
-               // include the physical foo.h, but can have a spelling that
-               // refers to it.
-               auto Spelling = spellHeader(
-                   {Providers.front(), PP.getHeaderSearchInfo(), MainFile});
-               for (const Include *I : Inc.match(Header{Spelling})) {
-                 Used.insert(I);
-                 Satisfied = true;
-               }
-               if (!Satisfied)
-                 Missing.insert(std::move(Spelling));
-             }
+                 !HeaderFilter(Providers.front().resolvedPath()))
+               Missing.insert(spellHeader(
+                   {Providers.front(), PP.getHeaderSearchInfo(), MainFile}));
            });
 
   AnalysisResults Results;
diff --git a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
index 5696c380758f8..6558b68087684 100644
--- a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
+++ b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
@@ -12,7 +12,6 @@
 #include "clang-include-cleaner/Record.h"
 #include "clang-include-cleaner/Types.h"
 #include "clang/AST/ASTContext.h"
-#include "clang/AST/DeclBase.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/SourceLocation.h"
@@ -297,31 +296,6 @@ TEST_F(AnalyzeTest, ResourceDirIsIgnored) {
   EXPECT_THAT(Results.Missing, testing::IsEmpty());
 }
 
-TEST_F(AnalyzeTest, DifferentHeaderSameSpelling) {
-  Inputs.ExtraArgs.push_back("-Ifoo");
-  Inputs.ExtraArgs.push_back("-Ifoo_inner");
-  // `foo` is declared in foo_inner/foo.h, but there's no way to spell it
-  // directly. Make sure we don't generate unusued/missing include findings in
-  // such cases.
-  Inputs.Code = R"cpp(
-    #include <foo.h>
-    void baz() {
-      foo();
-    }
-  )cpp";
-  Inputs.ExtraFiles["foo/foo.h"] = guard("#include_next <foo.h>");
-  Inputs.ExtraFiles["foo_inner/foo.h"] = guard(R"cpp(
-    void foo();
-  )cpp");
-  TestAST AST(Inputs);
-  std::vector<Decl *> DeclsInTU;
-  for (auto *D : AST.context().getTranslationUnitDecl()->decls())
-    DeclsInTU.push_back(D);
-  auto Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor());
-  EXPECT_THAT(Results.Unused, testing::IsEmpty());
-  EXPECT_THAT(Results.Missing, testing::IsEmpty());
-}
-
 TEST(FixIncludes, Basic) {
   llvm::StringRef Code = R"cpp(#include "d.h"
 #include "a.h"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
index 26c443b139629..f5e74df1621ce 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
@@ -756,21 +756,3 @@ struct test_implicit_throw {
 };
 
 }}
-
-void pointer_exception_can_not_escape_with_const_void_handler() noexcept {
-  // CHECK-MESSAGES-NOT: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'pointer_exception_can_not_escape_with_const_void_handler' which should not throw exceptions
-  const int value = 42;
-  try {
-    throw &value;
-  } catch (const void *) {
-  }
-}
-
-void pointer_exception_can_not_escape_with_void_handler() noexcept {
-  // CHECK-MESSAGES-NOT: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'pointer_exception_can_not_escape_with_void_handler' which should not throw exceptions
-  int value = 42;
-  try {
-    throw &value;
-  } catch (void *) {
-  }
-}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
index 0d1ff0db58371..cb6bfcc1dccba 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
@@ -3,7 +3,7 @@
 // RUN:     misc-const-correctness.TransformValues: true, \
 // RUN:     misc-const-correctness.WarnPointersAsValues: false, \
 // RUN:     misc-const-correctness.TransformPointersAsValues: false \
-// RUN:   }}" -- -fno-delayed-template-parsing -fexceptions
+// RUN:   }}" -- -fno-delayed-template-parsing
 
 // ------- Provide test samples for primitive builtins ---------
 // - every 'p_*' variable is a 'potential_const_*' variable
@@ -56,15 +56,6 @@ void some_function(double np_arg0, wchar_t np_arg1) {
   np_local6--;
 }
 
-int function_try_block() try {
-  int p_local0 = 0;
-  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'int' can be declared 'const'
-  // CHECK-FIXES: int const p_local0
-  return p_local0;
-} catch (...) {
-  return 0;
-}
-
 void nested_scopes() {
   int p_local0 = 2;
   // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'int' can be declared 'const'
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index 8e0835cb3158f..0218f0b21eb28 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -334,7 +334,6 @@ foreach(target armv6m-unknown-eabi;armv7m-unknown-eabi;armv8m.main-unknown-eabi)
   endforeach()
   set(RUNTIMES_${target}_LLVM_LIBC_FULL_BUILD ON CACHE BOOL "")
   set(RUNTIMES_${target}_LIBC_ENABLE_USE_BY_CLANG ON CACHE BOOL "")
-  set(RUNTIMES_${target}_LIBC_USE_NEW_HEADER_GEN OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ABI_VERSION 2 CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "")
@@ -386,7 +385,6 @@ foreach(target riscv32-unknown-elf)
   endforeach()
   set(RUNTIMES_${target}_LLVM_LIBC_FULL_BUILD ON CACHE BOOL "")
   set(RUNTIMES_${target}_LIBC_ENABLE_USE_BY_CLANG ON CACHE BOOL "")
-  set(RUNTIMES_${target}_LIBC_USE_NEW_HEADER_GEN OFF CACHE BOOL "")
   set(RUNTIMES_${target}_LIBCXX_ABI_VERSION 2 CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "")
   set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "")
diff --git a/clang/docs/ClangNVLinkWrapper.rst b/clang/docs/ClangNVLinkWrapper.rst
deleted file mode 100644
index 2acdb054572f8..0000000000000
--- a/clang/docs/ClangNVLinkWrapper.rst
+++ /dev/null
@@ -1,74 +0,0 @@
-====================
-Clang nvlink Wrapper
-====================
-
-.. contents::
-   :local:
-
-.. _clang-nvlink-wrapper:
-
-Introduction
-============
-
-This tools works as a wrapper around the NVIDIA ``nvlink`` linker. The purpose
-of this wrapper is to provide an interface similar to the ``ld.lld`` linker
-while still relying on NVIDIA's proprietary linker to produce the final output.
-
-``nvlink`` has a number of known quirks that make it difficult to use in a
-unified offloading setting. For example, it does not accept ``.o`` files as they
-must be named ``.cubin``. Static archives do not work, so passing a ``.a`` will
-provide a linker error. ``nvlink`` also does not support link time optimization
-and ignores many standard linker arguments. This tool works around these issues.
-
-Usage
-=====
-
-This tool can be used with the following options. Any arguments not intended
-only for the linker wrapper will be forwarded to ``nvlink``.
-
-.. code-block:: console
-
-  OVERVIEW: A utility that wraps around the NVIDIA 'nvlink' linker.
-  This enables static linking and LTO handling for NVPTX targets.
-
-  USAGE: clang-nvlink-wrapper [options] <options to passed to nvlink>
-
-  OPTIONS:
-    --arch <value>       Specify the 'sm_' name of the target architecture.
-    --cuda-path=<dir>    Set the system CUDA path
-    --dry-run            Print generated commands without running.
-    --feature <value>    Specify the '+ptx' freature to use for LTO.
-    -g                   Specify that this was a debug compile.
-    -help-hidden         Display all available options
-    -help                Display available options (--help-hidden for more)
-    -L <dir>             Add <dir> to the library search path
-    -l <libname>         Search for library <libname>
-    -mllvm <arg>         Arguments passed to LLVM, including Clang invocations,
-                         for which the '-mllvm' prefix is preserved. Use '-mllvm
-                         --help' for a list of options.
-    -o <path>            Path to file to write output
-    --plugin-opt=jobs=<value>
-                         Number of LTO codegen partitions
-    --plugin-opt=lto-partitions=<value>
-                         Number of LTO codegen partitions
-    --plugin-opt=O<O0, O1, O2, or O3>
-                         Optimization level for LTO
-    --plugin-opt=thinlto<value>
-                         Enable the thin-lto backend
-    --plugin-opt=<value> Arguments passed to LLVM, including Clang invocations,
-                         for which the '-mllvm' prefix is preserved. Use '-mllvm
-                         --help' for a list of options.
-    --save-temps         Save intermediate results
-    --version            Display the version number and exit
-    -v                   Print verbose information
-
-Example
-=======
-
-This tool is intended to be invoked when targeting the NVPTX toolchain directly
-as a cross-compiling target. This can be used to create standalone GPU
-executables with normal linking semantics similar to standard compilation.
-
-.. code-block:: console
-
-  clang --target=nvptx64-nvidia-cuda -march=native -flto=full input.c
diff --git a/clang/docs/HLSL/AvailabilityDiagnostics.rst b/clang/docs/HLSL/AvailabilityDiagnostics.rst
index c2f260f268e7b..bb9d02f21dde6 100644
--- a/clang/docs/HLSL/AvailabilityDiagnostics.rst
+++ b/clang/docs/HLSL/AvailabilityDiagnostics.rst
@@ -52,6 +52,7 @@ If the compilation target is a shader library, only availability based on shader
 
 As a result, availability based on specific shader stage will only be diagnosed in code that is reachable from a shader entry point or library export function. It also means that function bodies might be scanned multiple time. When that happens, care should be taken not to produce duplicated diagnostics.
 
+========
 Examples
 ========
 
@@ -61,7 +62,7 @@ For the example below, the ``WaveActiveCountBits`` API function became available
 The availability of ``ddx`` function depends on a shader stage. It is available for pixel shaders in shader model 2.1 and higher, for compute, mesh and amplification shaders in shader model 6.6 and higher. For any other shader stages it is not available.
 
 Compute shader example
-----------------------
+======================
 
 .. code-block:: c++
 
@@ -93,7 +94,7 @@ With strict diagnostic mode, in addition to the 2 errors above Clang will also e
    <>:7:13: error: 'WaveActiveCountBits' is only available on Shader Model 6.5 or newer
 
 Shader library example
-----------------------
+======================
 
 .. code-block:: c++
 
diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst
index 4782eb3cda754..a29b6348e0b8e 100644
--- a/clang/docs/HLSL/ExpectedDifferences.rst
+++ b/clang/docs/HLSL/ExpectedDifferences.rst
@@ -1,4 +1,4 @@
-===================================
+
 Expected Differences vs DXC and FXC
 ===================================
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 998ec4465cd08..481b6313974ea 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -37,23 +37,135 @@ These changes are ones which we think may surprise users when upgrading to
 Clang |release| because of the opportunity they pose for disruption to existing
 code bases.
 
+- Setting the deprecated CMake variable ``GCC_INSTALL_PREFIX`` (which sets the
+  default ``--gcc-toolchain=``) now leads to a fatal error.
+
 C/C++ Language Potentially Breaking Changes
 -------------------------------------------
 
+- Clang now supports raw string literals in ``-std=gnuXY`` mode as an extension in
+  C99 and later. This behavior can also be overridden using ``-f[no-]raw-string-literals``.
+  Support of raw string literals in C++ is not affected. Fixes (#GH85703).
+
 C++ Specific Potentially Breaking Changes
 -----------------------------------------
+- Clang now diagnoses function/variable templates that shadow their own template parameters, e.g. ``template<class T> void T();``.
+  This error can be disabled via `-Wno-strict-primary-template-shadow` for compatibility with previous versions of clang.
+
+- The behavior controlled by the `-frelaxed-template-template-args` flag is now
+  on by default, and the flag is deprecated. Until the flag is finally removed,
+  its negative spelling can be used to obtain compatibility with previous
+  versions of clang. The deprecation warning for the negative spelling can be
+  disabled with `-Wno-deprecated-no-relaxed-template-template-args`.
+
+- Clang now rejects pointer to member from parenthesized expression in unevaluated context such as ``decltype(&(foo::bar))``. (#GH40906).
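+
+  A minimal sketch of the now-rejected pattern (``foo`` and ``bar`` are the
+  placeholder names from the example above):
+
+  .. code-block:: c++
+
+     struct foo {
+       int bar;
+     };
+     // error: the parentheses around 'foo::bar' prevent forming a
+     // pointer to member, even in this unevaluated context
+     using T = decltype(&(foo::bar));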
+
+- Clang now performs semantic analysis for unary operators with dependent operands
+  that are known to be of non-class non-enumeration type prior to instantiation.
+
+  This change uncovered a bug in libstdc++ 14.1.0 which may cause compile failures
+  on systems using that version of libstdc++ and Clang 19, with an error that looks
+  something like this:
+
+  .. code-block:: text
+
+    <source>:4:5: error: expression is not assignable
+    4 |     ++this;
+      |     ^ ~~~~
+
+  To fix this, update libstdc++ to version 14.1.1 or greater.
 
 ABI Changes in This Version
 ---------------------------
+- Fixed Microsoft name mangling of implicitly defined variables used for thread
+  safe static initialization of static local variables. This change resolves
+  incompatibilities with code compiled by MSVC but might introduce
+  incompatibilities with code compiled by earlier versions of Clang when an
+  inline member function that contains a static local variable with a dynamic
+  initializer is declared with ``__declspec(dllimport)``. (#GH83616).
+
+- Fixed Microsoft name mangling of lifetime extended temporary objects. This
+  change corrects missing back reference registrations that could result in
+  incorrect back reference indexes and surprising demangled name results. Since
+  MSVC uses a different mangling for these objects, compatibility is not affected.
+  (#GH85423).
+
+- Fixed Microsoft calling convention for returning certain classes with a
+  templated constructor. If a class has a templated constructor, it should
+  be returned indirectly even if it meets all the other requirements for
+  returning a class in a register. This affects some uses of std::pair.
+  (#GH86384).
+
+- Fixed Microsoft calling convention when returning classes that have a deleted
+  copy assignment operator. Such a class should be returned indirectly.
+
+- Removed the global alias that was pointing to AArch64 Function Multiversioning
+  ifuncs. Its purpose was to preserve backwards compatibility when the ".ifunc"
+  suffix got removed from the name mangling. The alias interacts badly with
+  GlobalOpt (see the issue #96197).
+
+- Fixed Microsoft name mangling for auto non-type template arguments of pointer
+  type for MSVC 1920+. This change resolves incompatibilities with code compiled
+  by MSVC 1920+ but will introduce incompatibilities with code compiled by
+  earlier versions of Clang unless such code is built with the compiler option
+  `-fms-compatibility-version=19.14` to imitate the MSVC 1914 mangling behavior.
+
+- Fixed Microsoft name mangling for auto non-type template arguments of pointer
+  to member type for MSVC 1920+. This change resolves incompatibilities with code
+  compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by
+  earlier versions of Clang unless such code is built with the compiler option
+  `-fms-compatibility-version=19.14` to imitate the MSVC 1914 mangling behavior.
+  (#GH70899).
 
 AST Dumping Potentially Breaking Changes
 ----------------------------------------
 
+- The text ast-dumper has improved printing of TemplateArguments.
+- The text decl-dumper prints template parameters' trailing requires expressions now.
+
 Clang Frontend Potentially Breaking Changes
 -------------------------------------------
+- Removed support for constructing on-stack ``TemplateArgumentList``\ s; interfaces should instead
+  use ``ArrayRef<TemplateArgument>`` to pass template arguments. Transitioning internal uses to
+  ``ArrayRef<TemplateArgument>`` reduces AST memory usage by 0.4% when compiling clang, and is
+  expected to show similar improvements on other workloads.
+
+- The ``-Wgnu-binary-literal`` diagnostic group no longer controls any
+  diagnostics. Binary literals are no longer a GNU extension, they're now a C23
+  extension which is controlled via ``-pedantic`` or ``-Wc23-extensions``. Use
+  of ``-Wno-gnu-binary-literal`` will no longer silence this pedantic warning,
+  which may break existing uses with ``-Werror``.
+
+- The normalization of 3 element target triples where ``-none-`` is the middle
+  element has changed. For example, ``armv7m-none-eabi`` previously normalized
+  to ``armv7m-none-unknown-eabi``, with ``none`` for the vendor and ``unknown``
+  for the operating system. It now normalizes to ``armv7m-unknown-none-eabi``,
+  which has ``unknown`` vendor and ``none`` operating system.
+
+  The affected triples are primarily for bare metal Arm, where it is intended
+  that ``none`` means that there is no operating system, as opposed to an
+  unknown type of operating system.
+
+  This change may cause clang to not find libraries, or libraries to be built at
+  different file system locations. This can be fixed by changing your builds to
+  use the new normalized triple. However, we recommend instead getting the
+  normalized triple from clang itself, as this will make your builds more
+  robust in case of future changes::
+
+    $ clang --target=<your target triple> -print-target-triple
+    <the normalized target triple>
+
+- The ``hasTypeLoc`` AST matcher will no longer match a ``classTemplateSpecializationDecl``;
+  existing uses should switch to ``templateArgumentLoc`` or ``hasAnyTemplateArgumentLoc`` instead.
 
 Clang Python Bindings Potentially Breaking Changes
 --------------------------------------------------
+- Renamed ``CursorKind`` variant 272 from ``OMP_TEAMS_DISTRIBUTE_DIRECTIVE``
+  to ``OMP_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE``. The previous name was incorrect, it was a duplicate
+  of variant 271.
+- Renamed ``TypeKind`` variant 162 from ``OBJCCLASS`` to ``OBJCTYPEPARAM``.
+  The previous name was incorrect, it was a duplicate of variant 28.
+- Refactored enum implementation, switching to the standard library `Enum` type.
 
 What's New in Clang |release|?
 ==============================
@@ -62,62 +174,547 @@ here. Generic improvements to Clang as a whole or to its underlying
 infrastructure are described first, followed by language-specific
 sections with improvements to Clang's support for those languages.
 
+- Implemented improvements to BMIs for C++20 Modules that can reduce
+  the number of rebuilds during incremental recompilation. We are seeking
+  feedback from Build System authors and other interested users, especially
+  when you feel Clang changes the BMI and misses an opportunity to avoid
+  recompilations or causes correctness issues. See
+  `StandardCPlusPlusModules <StandardCPlusPlusModules.html>`_ for more details.
+
+- The ``\par`` documentation comment command now supports an optional
+  argument, which denotes the header of the paragraph started by
+  an instance of the ``\par`` command comment. The implementation
+  of the argument handling matches its semantics
+  `in Doxygen <https://www.doxygen.nl/manual/commands.html#cmdpar>`_.
+  Namely, any text on the same line as the ``\par`` command will become
+  a header for the paragraph, and if there is no text then the command
+  will start a new paragraph.
+
 C++ Language Changes
 --------------------
+- C++17 support is now completed, with the enablement of the
+  relaxed template template argument matching rules introduced in P0522,
+  which was retroactively applied as a defect report.
+  While the implementation already existed since Clang 4, it was turned off by
+  default, and was controlled with the `-frelaxed-template-template-args` flag.
+  In this release, we implement provisional wording for a core defect on
+  P0522 (CWG2398), which avoids the most serious compatibility issues caused
+  by it, allowing us to enable it by default in this release.
+  The flag is now deprecated, and will be removed in the next release, but can
+  still be used to turn it off and regain compatibility with previous versions
+  (#GH36505).
+- Implemented ``_BitInt`` literal suffixes ``__wb`` or ``__WB`` as a Clang extension with ``unsigned`` modifiers also allowed. (#GH85223).
 
 C++17 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
+- Clang now exposes ``__GCC_DESTRUCTIVE_SIZE`` and ``__GCC_CONSTRUCTIVE_SIZE``
+  predefined macros to support standard library implementations of
+  ``std::hardware_destructive_interference_size`` and
+  ``std::hardware_constructive_interference_size``, respectively. These macros
+  are predefined in all C and C++ language modes. The values the macros
+  expand to are not stable between releases of Clang and do not need to match
+  the values produced by GCC, so these macros should not be used from header
+  files because they may not be stable across multiple TUs (the values may vary
+  based on compiler version as well as CPU tuning). #GH60174
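+
+  A minimal sketch of the intended use (the macro values are
+  implementation-defined and may differ between Clang releases):
+
+  .. code-block:: c++
+
+     // Pad each counter to the destructive interference size to
+     // reduce false sharing between threads.
+     struct alignas(__GCC_DESTRUCTIVE_SIZE) Counter {
+       unsigned long value;
+     };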
 
 C++14 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
+- Sized deallocation is enabled by default in C++14 onwards. The user may specify
+  ``-fno-sized-deallocation`` to disable it if there are some regressions.
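+
+  A minimal sketch of what sized deallocation enables, assuming the global
+  allocator is replaced:
+
+  .. code-block:: c++
+
+     #include <cstdio>
+     #include <cstdlib>
+     #include <new>
+
+     void *operator new(std::size_t sz) {
+       if (void *p = std::malloc(sz))
+         return p;
+       throw std::bad_alloc();
+     }
+
+     void operator delete(void *p) noexcept { std::free(p); }
+
+     // In C++14 mode, 'delete' prefers this sized overload when the
+     // size of the deallocated object is statically known.
+     void operator delete(void *p, std::size_t sz) noexcept {
+       std::printf("freeing %zu bytes\n", sz);
+       std::free(p);
+     }
+
+     int main() { delete new int; }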
 
 C++20 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
+- Clang no longer performs ODR checks for decls in the global module fragment,
+  to ease the implementation and improve the user experience.
+  This follows MSVC's behavior. Users interested in testing the stricter
+  behavior can use the flag '-Xclang -fno-skip-odr-check-in-gmf'.
+  (#GH79240).
+
+- Implemented the `__is_layout_compatible` and `__is_pointer_interconvertible_base_of`
+  intrinsics to support
+  `P0466R5: Layout-compatibility and Pointer-interconvertibility Traits <https://wg21.link/P0466R5>`_.
+
+- Clang now implements [module.import]p7 fully. Clang will now import module
+  units transitively for module units coming from the same module as the
+  current module unit. Fixes #GH84002
+
+- Initial support for class template argument deduction (CTAD) for type alias
+  templates (`P1814R0 <https://wg21.link/p1814r0>`_).
+  (#GH54051).
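+
+  A minimal sketch of alias template CTAD:
+
+  .. code-block:: c++
+
+     #include <utility>
+
+     template <typename T>
+     using IntPair = std::pair<int, T>;
+
+     IntPair p{1, 2.5}; // deduced as std::pair<int, double>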
+
+- We have sufficient confidence and experience with the concepts implementation
+  to update the ``__cpp_concepts`` macro to `202002L`. This enables
+  ``<expected>`` from libstdc++ to work correctly with Clang.
+
+- User defined constructors are allowed for copy-list-initialization with CTAD.
+  (#GH62925).
+
 C++23 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
+- Implemented `P2718R0: Lifetime extension in range-based for loops <https://wg21.link/P2718R0>`_. Also,
+  a temporary object which is a prvalue in a discarded-value expression is now materialized.
+- Implemented `P1774R8: Portable assumptions <https://wg21.link/P1774R8>`_.
+
+- Implemented `P2448R2: Relaxing some constexpr restrictions <https://wg21.link/P2448R2>`_.
+  Note, the ``-Winvalid-constexpr`` diagnostic is now disabled in C++23 mode,
+  but can be explicitly specified to retain the old diagnostic checking
+  behavior.
+
+- Added a ``__reference_converts_from_temporary`` builtin, completing the necessary compiler support for
+  `P2255R2: Type trait to determine if a reference binds to a temporary <https://wg21.link/P2255R2>`_.
+
+- Implemented `P2797R0: Static and explicit object member functions with the same parameter-type-lists <https://wg21.link/P2797R0>`_.
+  This completes the support for "deducing this".
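+
+  A minimal sketch of an overload set that this makes valid:
+
+  .. code-block:: c++
+
+     struct S {
+       void f(this S &self, int); // explicit object member function
+       static void f(int);        // same parameter-type-list is now allowed
+     };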
+
 C++2c Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
+- Implemented `P2662R3 Pack Indexing <https://wg21.link/P2662R3>`_.
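+
+  A minimal sketch of pack indexing:
+
+  .. code-block:: c++
+
+     template <typename... Ts>
+     using First = Ts...[0]; // names the first type in the pack
+
+     static_assert(__is_same(First<int, float>, int));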
+
+- Implemented `P2573R2: = delete("should have a reason"); <https://wg21.link/P2573R2>`_
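+
+  A minimal sketch (``old_api`` and ``new_api`` are placeholder names):
+
+  .. code-block:: c++
+
+     void old_api() = delete("use new_api() instead");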
+
+- Implemented `P0609R3: Attributes for Structured Bindings <https://wg21.link/P0609R3>`_
+
+- Implemented `P2748R5 Disallow Binding a Returned Glvalue to a Temporary <https://wg21.link/P2748R5>`_.
+
+- Implemented `P2809R3: Trivial infinite loops are not Undefined Behavior <https://wg21.link/P2809R3>`_.
+
+- Implemented `P3144R2 Deleting a Pointer to an Incomplete Type Should be Ill-formed <https://wg21.link/P3144R2>`_.
+
+- Implemented `P2963R3 Ordering of constraints involving fold expressions <https://wg21.link/P2963R3>`_.
+
+- Implemented `P3034R1 Module Declarations Shouldn’t be Macros <https://wg21.link/P3034R1>`_.
+
+
 Resolutions to C++ Defect Reports
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- Substitute template parameter pack, when it is not explicitly specified
+  in the template parameters, but is deduced from a previous argument. (#GH78449)
+
+- Type qualifications are now ignored when evaluating layout compatibility
+  of two types.
+  (`CWG1719: Layout compatibility and cv-qualification revisited <https://cplusplus.github.io/CWG/issues/1719.html>`_).
+
+- Alignment of members is now respected when evaluating layout compatibility
+  of structs.
+  (`CWG2583: Common initial sequence should consider over-alignment <https://cplusplus.github.io/CWG/issues/2583.html>`_).
+
+- ``[[no_unique_address]]`` is now respected when evaluating layout
+  compatibility of two types.
+  (`CWG2759: [[no_unique_address] and common initial sequence  <https://cplusplus.github.io/CWG/issues/2759.html>`_).
+
+- Clang now diagnoses declarative nested-name-specifiers with pack-index-specifiers.
+  (`CWG2858: Declarative nested-name-specifiers and pack-index-specifiers <https://cplusplus.github.io/CWG/issues/2858.html>`_).
+
+- Clang now allows attributes on concepts.
+  (`CWG2428: Deprecating a concept <https://cplusplus.github.io/CWG/issues/2428.html>`_).
+
+- P0522 implementation is enabled by default in all language versions, and
+  provisional wording for CWG2398 is implemented.
+
+- Clang now performs type-only lookup for the name in ``using enum`` declaration.
+  (`CWG2877: Type-only lookup for using-enum-declarator <https://cplusplus.github.io/CWG/issues/2877.html>`_).
+
+- Clang now requires a template argument list after a template keyword.
+  (`CWG96: Syntactic disambiguation using the template keyword <https://cplusplus.github.io/CWG/issues/96.html>`_).
+
+- Clang now considers ``noexcept(typeid(expr))`` more carefully, instead of always assuming that ``std::bad_typeid`` can be thrown.
+  (`CWG2191: Incorrect result for noexcept(typeid(v)) <https://cplusplus.github.io/CWG/issues/2191.html>`_).
 
 C Language Changes
 ------------------
 
 C2y Feature Support
 ^^^^^^^^^^^^^^^^^^^
+- Clang now enables C2y mode with ``-std=c2y``. This sets ``__STDC_VERSION__``
+  to ``202400L`` so that it's greater than the value for C23. The value of this
+  macro is subject to change in the future.
 
 C23 Feature Support
 ^^^^^^^^^^^^^^^^^^^
+- No longer diagnose use of binary literals as an extension in C23 mode. Fixes
+  #GH72017.
+
+- Corrected parsing behavior for the ``alignas`` specifier/qualifier in C23. We
+  previously handled it as an attribute as in C++, but there are parsing
+  differences. The behavioral differences are:
+
+  .. code-block:: c
+
+     struct alignas(8) /* was accepted, now rejected */ S {
+       char alignas(8) /* was rejected, now accepted */ C;
+     };
+     int i alignas(8) /* was accepted, now rejected */ ;
+
+  Fixes (#GH81472).
+
+- Clang now generates predefined macros of the form ``__TYPE_FMTB__`` and
+  ``__TYPE_FMTb__`` (e.g., ``__UINT_FAST64_FMTB__``) in C23 mode for use with
+  macros typically exposed from ``<inttypes.h>``, such as ``PRIb8``. (#GH81896)
+
+- Clang now supports `N3018 The constexpr specifier for object definitions
+  <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3018.htm>`_.
+
+- Properly promote bit-fields of bit-precise integer types to the field's type
+  rather than to ``int``. #GH87641
+
+- Added the ``INFINITY`` and ``NAN`` macros to Clang's ``<float.h>``
+  freestanding implementation; these macros were defined in ``<math.h>`` in C99
+  but C23 added them to ``<float.h>`` in
+  `WG14 N2848 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2848.pdf>`_.
+
+- Clang now supports `N3017 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3017.htm>`_
+  ``#embed`` - a scannable, tooling-friendly binary resource inclusion mechanism.
+
+- Added the ``FLT_NORM_MAX``, ``DBL_NORM_MAX``, and ``LDBL_NORM_MAX`` to the
+  freestanding implementation of ``<float.h>`` that ships with Clang.
+
+- Compiler support for `N2653 char8_t: A type for UTF-8 characters and strings
+  <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm>`_: ``u8`` string
+  literals are now of type ``char8_t[N]`` in C23 and expose
+  ``__CLANG_ATOMIC_CHAR8_T_LOCK_FREE``/``__GCC_ATOMIC_CHAR8_T_LOCK_FREE`` to
+  implement the corresponding macro in ``<stdatomic.h>``.
+
+Non-comprehensive list of changes in this release
+-------------------------------------------------
+
+- Added ``__builtin_readsteadycounter`` for reading fixed frequency hardware
+  counters.
+
+- ``__builtin_addc``, ``__builtin_subc``, and the other sizes of those
+  builtins are now constexpr and may be used in constant expressions.
+
+- Added ``__builtin_popcountg`` as a type-generic alternative to
+  ``__builtin_popcount{,l,ll}`` with support for any unsigned integer type. Like
+  the previous builtins, this new builtin is constexpr and may be used in
+  constant expressions.
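+
+  A minimal sketch, usable in constant expressions:
+
+  .. code-block:: c++
+
+     static_assert(__builtin_popcountg(0xF0u) == 4);
+     static_assert(__builtin_popcountg(0xFFFFull) == 16);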
+
+- Lambda expressions are now accepted in C++03 mode as an extension.
+
+- Added ``__builtin_clzg`` and ``__builtin_ctzg`` as type-generic alternatives
+  to ``__builtin_clz{,s,l,ll}`` and ``__builtin_ctz{,s,l,ll}`` respectively,
+  with support for any unsigned integer type. Like the previous builtins, these
+  new builtins are constexpr and may be used in constant expressions.
+
+- ``__typeof_unqual__`` is available in all C modes as an extension, which behaves
+  like ``typeof_unqual`` from C23, similar to ``__typeof__`` and ``typeof``.
+
+- ``__builtin_reduce_{add|mul|xor|or|and|min|max}`` builtins now support scalable vectors.
+
+* Shared libraries linked with either the ``-ffast-math``, ``-Ofast``, or
+  ``-funsafe-math-optimizations`` flags will no longer enable flush-to-zero
+  floating-point mode by default. This decision can be overridden with use of
+  ``-mdaz-ftz``. This behavior now matches GCC's behavior.
+  (`#57589 <https://github.com/llvm/llvm-project/issues/57589>`_)
+
+* ``-fdenormal-fp-math=preserve-sign`` is no longer implied by ``-ffast-math``
+  on x86 systems.
+
+- Builtins ``__builtin_shufflevector()`` and ``__builtin_convertvector()`` may
+  now be used within constant expressions.
+
+- When compiling a constexpr function, Clang will check to see whether the
+  function can *never* be used in a constant expression context and issues a
+  diagnostic under the ``-Winvalid-constexpr`` diagnostic flag (which defaults
+  to an error). This check can be expensive because the mere presence of a
+  function marked ``constexpr`` will cause us to undergo constant expression
+  evaluation, even if the function is not called within the translation unit
+  being compiled. Due to the expense, Clang no longer checks constexpr function
+  bodies when the function is defined in a system header file or when
+  ``-Winvalid-constexpr`` is not enabled for the function definition, which
+  should result in mild compile-time performance improvements.
+
+- Added ``__is_bitwise_cloneable`` which is used to check whether a type
+  can be safely copied by memcpy/memmove.
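+
+  A minimal sketch of the trait:
+
+  .. code-block:: c++
+
+     struct Plain { int x; };
+     // Trivially copyable types are bitwise cloneable.
+     static_assert(__is_bitwise_cloneable(Plain));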
+
+- ``#pragma GCC diagnostic warning "-Wfoo"`` can now downgrade ``-Werror=foo``
+  errors and certain default-to-error ``-W`` diagnostics to warnings.
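+
+  A minimal sketch, assuming the file is compiled with
+  ``-Werror=unused-variable``:
+
+  .. code-block:: c++
+
+     // Downgrades the error back to a warning for the rest of the file.
+     #pragma GCC diagnostic warning "-Wunused-variable"
+
+     void f() { int unused_local = 0; }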
+
+- Support importing C++20 modules in clang-repl.
+
+- Added support for ``TypeLoc::dump()`` for easier debugging, and improved
+  textual and JSON dumping for various ``TypeLoc``-related nodes.
+
+- Clang can now emit distinct type-based alias analysis tags for incompatible
+  pointers, enabling more powerful alias analysis when accessing pointer types.
+  The new behavior can be enabled using ``-fpointer-tbaa``.
 
 New Compiler Flags
 ------------------
+- ``-fsanitize=implicit-bitfield-conversion`` checks implicit truncation and
+  sign change.
+- ``-fsanitize=implicit-integer-conversion``, a group that replaces the previous
+  group ``-fsanitize=implicit-conversion``.
+
+- ``-Wmissing-designated-field-initializers``, grouped under ``-Wmissing-field-initializers``.
+  This diagnostic can be disabled to make ``-Wmissing-field-initializers`` behave
+  like it did before Clang 18.x. Fixes #GH56628
+
+- ``-fexperimental-modules-reduced-bmi`` enables the Reduced BMI for C++20 named modules.
+  See the documentation for standard C++ modules for details.
+
+- ``-fexperimental-late-parse-attributes`` enables an experimental feature to
+  allow late parsing certain attributes in specific contexts where they would
+  not normally be late parsed. Currently this allows late parsing the
+  `counted_by` attribute in C. See `Attribute Changes in Clang`_.
+
+- ``-fseparate-named-sections`` uses separate unique sections for global
+  symbols in named special sections (i.e. symbols annotated with
+  ``__attribute__((section(...)))``). This enables linker GC to collect unused
+  symbols without having to use a per-symbol section.
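+
+  A minimal sketch, assuming linking with ``--gc-sections`` (the section and
+  symbol names are illustrative):
+
+  .. code-block:: c++
+
+     // With -fseparate-named-sections each of these gets its own unique
+     // section rather than sharing one ".mysection", so the linker can
+     // discard the unused symbol.
+     __attribute__((section(".mysection"))) int live = 1;
+     __attribute__((section(".mysection"))) int dead = 2;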
+
+- ``-fms-define-stdc`` and its clang-cl counterpart ``/Zc:__STDC__``.
+  Matches MSVC behavior by defining ``__STDC__`` to ``1`` when
+  MSVC compatibility mode is used. It has no effect for C++ code.
+
+- ``-Wc++23-compat`` group was added to help migrating existing codebases
+  to C++23.
+
+- ``-Wc++2c-compat`` group was added to help migrating existing codebases
+  to upcoming C++26.
+
+- ``-fdisable-block-signature-string`` instructs clang not to emit the signature
+  string for blocks. Disabling the string can potentially break existing code
+  that relies on it. Users should carefully consider this possibility when using
+  the flag.
+
+- For the ARM target, added ``-Warm-interrupt-vfp-clobber`` that will emit a
+  diagnostic when an interrupt handler is declared and VFP is enabled.
+
+- ``-fpointer-tbaa`` enables emission of distinct type-based alias
+  analysis tags for incompatible pointers.
 
 Deprecated Compiler Flags
 -------------------------
 
+- The ``-Ofast`` command-line option has been deprecated. This option both
+  enables the ``-O3`` optimization level and enables non-standard
+  ``-ffast-math`` behaviors. As such, it is somewhat misleading as an
+  "optimization level". Users are advised to switch to ``-O3 -ffast-math`` if
+  the use of non-standard math behavior is intended, and ``-O3`` otherwise.
+  See `RFC <https://discourse.llvm.org/t/rfc-deprecate-ofast/78687>`_ for details.
+
 Modified Compiler Flags
 -----------------------
+- Added a new diagnostic flag ``-Wreturn-mismatch`` which is grouped under
+  ``-Wreturn-type``, and moved some of the diagnostics previously controlled by
+  ``-Wreturn-type`` under this new flag. Fixes #GH72116.
+- ``-fsanitize=implicit-conversion`` is now a group for both
+  ``-fsanitize=implicit-integer-conversion`` and
+  ``-fsanitize=implicit-bitfield-conversion``.
+
+- Added ``-Wcast-function-type-mismatch`` under the ``-Wcast-function-type``
+  warning group. Moved the diagnostic previously controlled by
+  ``-Wcast-function-type`` to the new warning group and added
+  ``-Wcast-function-type-mismatch`` to ``-Wextra``. #GH76872
+
+  .. code-block:: c
+
+     int x(long);
+     typedef int (f2)(void*);
+     typedef int (f3)();
+
+     void func(void) {
+       // Diagnoses under -Wcast-function-type, -Wcast-function-type-mismatch,
+       // -Wcast-function-type-strict, -Wextra
+       f2 *b = (f2 *)x;
+       // Diagnoses under -Wcast-function-type, -Wcast-function-type-strict
+       f3 *c = (f3 *)x;
+     }
+
+- Carved out the ``-Wformat`` warning about scoped enums into a subwarning and
+  made it controlled by ``-Wformat-pedantic``. Fixes #GH88595.
+
+- Trivial infinite loops (i.e. loops with a constant controlling expression
+  evaluating to ``true`` and an empty body such as ``while(1);``)
+  are considered infinite, even when the ``-ffinite-loop`` flag is set.
+
+- Diagnostics groups about compatibility with a particular C++ Standard version
+  now include diagnostics about C++26 features that are not present in older
+  versions.
+
+- Removed the "arm interrupt calling convention" warning that was included in
+  ``-Wextra`` without its own flag. This warning suggested adding
+  ``__attribute__((interrupt))`` to functions that are called from interrupt
+  handlers to prevent clobbering VFP registers. Following this suggestion leads
+  to unpredictable behavior by causing multiple exception returns from one
+  exception. Fixes #GH34876.
 
 Removed Compiler Flags
 -------------------------
 
+- The ``-freroll-loops`` flag has been removed. It had no effect since Clang 13.
+- ``-m[no-]unaligned-access`` is removed for RISC-V and LoongArch.
+  ``-m[no-]strict-align``, also supported by GCC, should be used instead. (#GH85350)
+
 Attribute Changes in Clang
 --------------------------
+- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or
+  ``[[clang::amdgpu_max_num_work_groups(x, y, z)]]`` for the AMDGPU target. This attribute can be
+  attached to HIP or OpenCL kernel function definitions to provide an optimization hint. The parameters
+  ``x``, ``y``, and ``z`` specify the maximum number of workgroups for the respective dimensions,
+  and each must be a positive integer when provided. The parameter ``x`` is required, while ``y`` and
+  ``z`` are optional with default value of 1.
+
+- The ``swiftasynccc`` attribute is now considered to be a Clang extension
+  rather than a language standard feature. Please use
+  ``__has_extension(swiftasynccc)`` to check the availability of this attribute
+  for the target platform instead of ``__has_feature(swiftasynccc)``. Also,
+  added a new extension query ``__has_extension(swiftcc)`` corresponding to the
+  ``__attribute__((swiftcc))`` attribute.
+
+- The ``_Nullable`` and ``_Nonnull`` family of type attributes can now apply
+  to certain C++ class types, such as smart pointers:
+  ``void useObject(std::unique_ptr<Object> _Nonnull obj);``.
+
+  This works for standard library types including ``unique_ptr``, ``shared_ptr``,
+  and ``function``. See
+  `the attribute reference documentation <https://llvm.org/docs/AttributeReference.html#nullability-attributes>`_
+  for the full list.
+
+- The ``_Nullable`` attribute can be applied to C++ class declarations:
+  ``template <class T> class _Nullable MySmartPointer {};``.
+
+  This allows the ``_Nullable`` and ``_Nonnull`` family of type attributes to
+  apply to this class.
+
+- Clang now warns that the ``exclude_from_explicit_instantiation`` attribute
+  is ignored when applied to a local class or a member thereof.
+
+- The ``clspv_libclc_builtin`` attribute has been added to allow clspv
+  (`OpenCL-C to Vulkan SPIR-V compiler <https://github.com/google/clspv>`_) to identify functions coming from libclc
+  (`OpenCL-C builtin library <https://libclc.llvm.org>`_).
+- The ``counted_by`` attribute is now allowed on pointers that are members of a
+  struct in C.
+
+- The ``counted_by`` attribute can now be late parsed in C when
+  ``-fexperimental-late-parse-attributes`` is passed, but only when the attribute
+  is used in the declaration attribute position. This allows using the
+  attribute on existing code where it was previously impossible to do so, because
+  re-ordering struct field declarations to avoid late parsing would break ABI,
+  as shown below.
+
+  .. code-block:: c
+
+     struct BufferTy {
+       /* Referring to `count` requires late parsing */
+       char* buffer __counted_by(count);
+       /* Swapping `buffer` and `count` to avoid late parsing would break ABI */
+       size_t count;
+     };
+
+- The attributes ``sized_by``, ``counted_by_or_null`` and ``sized_by_or_null``
+  have been added as variants on ``counted_by``, each with slightly different semantics.
+  ``sized_by`` takes a byte size parameter instead of an element count, allowing pointees
+  with unknown size. The ``counted_by_or_null`` and ``sized_by_or_null`` variants are equivalent
+  to their base variants, except the pointer can be null regardless of count/size value.
+  If the pointer is null the size is effectively 0. ``sized_by_or_null`` is needed to properly
+  annotate allocator functions like ``malloc`` that return a buffer of a given byte size, but can
+  also return null.
+
+- The ``guarded_by``, ``pt_guarded_by``, ``acquired_after``, ``acquired_before``
+  attributes now support referencing struct members in C. The arguments are also
+  now late parsed when ``-fexperimental-late-parse-attributes`` is passed like
+  for ``counted_by``.
+
+- Introduced new function type attributes ``[[clang::nonblocking]]``, ``[[clang::nonallocating]]``,
+  ``[[clang::blocking]]``, and ``[[clang::allocating]]``, with GNU-style variants as well.
+  The attributes declare constraints about a function's behavior pertaining to blocking and
+  heap memory allocation.
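+
+  A minimal sketch of the placement of these type attributes
+  (``audio_callback`` is a hypothetical function):
+
+  .. code-block:: c++
+
+     // Declares that the callback neither blocks nor allocates heap memory.
+     void audio_callback(float *buffer, int frames) [[clang::nonblocking]];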
 
 Improvements to Clang's diagnostics
 -----------------------------------
+- Clang now emits an error instead of a warning for ``-Wundefined-internal``
+  when compiling with ``-pedantic-errors`` to conform to the C standard.
+
+- Clang now applies syntax highlighting to the code snippets it
+  prints.
+
+- Clang now diagnoses member template declarations with multiple declarators.
+
+- Clang now diagnoses use of the ``template`` keyword after declarative nested
+  name specifiers.
+
+- The ``-Wshorten-64-to-32`` diagnostic is now grouped under ``-Wimplicit-int-conversion`` instead
+  of ``-Wconversion``. Fixes #GH69444.
 
-- Some template related diagnostics have been improved.
+- Clang now uses thousand separators when printing large numbers in integer overflow diagnostics.
+  Fixes #GH80939.
+
+- Clang now diagnoses friend declarations with an ``enum`` elaborated-type-specifier in language modes after C++98.
+
+- Added diagnostics for C11 keywords being incompatible with language standards
+  before C11, under a new warning group: ``-Wpre-c11-compat``.
+
+- Now diagnoses an enumeration constant whose value is larger than can be
+  represented by ``unsigned long long``, which can happen with a large constant
+  using the ``wb`` or ``uwb`` suffix. The maximal underlying type is currently
+  ``unsigned long long``, but this behavior may change in the future when Clang
+  implements
+  `WG14 N3029 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3029.htm>`_.
+  (#GH69352).
+
+- Clang now diagnoses extraneous template parameter lists as a language extension.
+
+- Clang now diagnoses declarative nested name specifiers that name alias templates.
+
+- Clang now diagnoses lambda function expressions being implicitly cast to boolean values, under ``-Wpointer-bool-conversion``.
+  Fixes #GH82512.
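+
+  A minimal sketch of code that now warns:
+
+  .. code-block:: c++
+
+     // warning: the lambda converts (via its function pointer) to a bool
+     // that is always true
+     bool b = [] { return false; };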
+
+- Clang now provides improved warnings for the ``cleanup`` attribute to detect misuse scenarios,
+  such as attempting to call ``free`` on an unallocated object. Fixes #GH79443.
+
+- Clang no longer warns when the ``bitand`` operator is used with boolean
+  operands, distinguishing it from potential typographical errors or unintended
+  bitwise operations. Fixes #GH77601.
+
+- Clang now correctly diagnoses no arguments to a variadic macro parameter as a C23/C++20 extension.
+  Fixes #GH84495.
+
+- Clang no longer emits a ``-Wexit-time-destructors`` warning on static variables explicitly
+  annotated with the ``clang::always_destroy`` attribute.
+  Fixes #GH68686, #GH86486
+
+- ``-Wmicrosoft``, ``-Wgnu``, or ``-pedantic`` is now required to diagnose C99
+  flexible array members in a union or alone in a struct. Fixes #GH84565.
+
+- Clang no longer diagnoses type definitions in ``offsetof`` in C23 mode.
+  Fixes #GH83658.
+
+- New ``-Wformat-signedness`` diagnostic that warns if the format string requires an
+  unsigned argument and the argument is signed, and vice versa.
+
+- Clang now emits an ``unused argument`` warning when the ``-fmodule-output`` flag
+  is used with an input that is not of type c++-module.
+
+- Clang emits a ``-Wreturn-stack-address`` warning if a function returns a pointer or
+  reference to a struct literal. Fixes #GH8678
+
+- Clang emits a ``-Wunused-but-set-variable`` warning on C++ variables whose declaration
+  (with initializer) entirely consists of the condition expression of an if/while/for
+  construct but which are not actually used in its body. Fixes #GH41447
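+
+  A minimal sketch of code that now warns (``compute`` is a placeholder
+  function):
+
+  .. code-block:: c++
+
+     int compute();
+     void f() {
+       if (int x = compute()) { // warning: 'x' is set but never used
+       }
+     }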
+
+- Clang emits a diagnostic when a tentative array definition is assumed to have
+  a single element, but that diagnostic was never given a diagnostic group.
+  Added the ``-Wtentative-definition-array`` warning group to cover this.
+  Fixes #GH87766
+
+- Clang now uses the correct type-parameter-key (``class`` or ``typename``) when printing
+  template template parameter declarations.
+
+- Clang now diagnoses requires expressions with explicit object parameters.
+
+- Clang now looks up members of the current instantiation in the template definition context
+  if the current instantiation has no dependent base classes.
 
   .. code-block:: c++
-    
-     void foo() { template <typename> int i; } // error: templates can only be declared in namespace or class scope
 
-     struct S {
-      template <typename> int i; // error: non-static data member 'i' cannot be declared as a template
+     template<typename T>
+     struct A {
+       int f() {
+         return this->x; // error: no member named 'x' in 'A<T>'
+       }
      };
 
 - Clang emits a ``-Wparentheses`` warning for expressions with consecutive comparisons like ``x < y < z``.
@@ -154,30 +751,364 @@ Improvements to Clang's diagnostics
 Improvements to Clang's time-trace
 ----------------------------------
 
+- Clang now specifies that using ``auto`` in a lambda parameter is a C++14 extension when
+  appropriate. (`#46059: <https://github.com/llvm/llvm-project/issues/46059>`_).
+
 Improvements to Coverage Mapping
 --------------------------------
 
+- Macros defined in system headers are not expanded in coverage
+  mapping. Conditional expressions in system header macros are no
+  longer taken into account for branch coverage. They can be included
+  with ``-mllvm -system-headers-coverage``.
+  (`#78920: <https://github.com/llvm/llvm-project/issues/78920>`_)
+- MC/DC Coverage has been improved.
+  (`#82448: <https://github.com/llvm/llvm-project/pull/82448>`_)
+
+  - The maximum number of conditions is no longer limited to 6. See
+    `this <SourceBasedCodeCoverage.html#mc-dc-instrumentation>`_ for
+    more details.
+
 Bug Fixes in This Version
 -------------------------
+- Clang's ``-Wundefined-func-template`` no longer warns on pure virtual
+  functions. (#GH74016)
+
+- Fixed missing warnings when comparing mismatched enumeration constants
+  in C (#GH29217)
+
+- Clang now accepts elaborated-type-specifiers that explicitly specialize
+  a member class template for an implicit instantiation of a class template.
+
+- Fixed missing warnings when doing bool-like conversions in C23 (#GH79435).
+- Clang's ``-Wshadow`` no longer warns when an init-capture is named the same as
+  a class field unless the lambda can capture this.
+  Fixes (#GH71976)
+
+- Clang now accepts qualified partial/explicit specializations of variable templates that
+  are not nominable in the lookup context of the specialization.
+
+- Clang no longer produces a false-positive `-Wconstant-logical-operand` warning
+  for logical operators in C23.
+  Fixes (#GH64356).
+
+- ``__is_trivially_relocatable`` no longer returns ``false`` for volatile-qualified types.
+  Fixes (#GH77091).
+
+- Clang no longer produces a false-positive `-Wunused-variable` warning
+  for variables created through copy initialization having side-effects in C++17 and later.
+  Fixes (#GH64356) (#GH79518).
+
+- Fix value of predefined macro ``__FUNCTION__`` in MSVC compatibility mode.
+  Fixes (#GH66114).
+
+- Clang now emits errors for explicit specializations/instantiations of a lambda call
+  operator.
+  Fixes (#GH83267).
+
+- Fix crash on ill-formed partial specialization with CRTP.
+  Fixes (#GH89374).
+
+- Clang now correctly generates overloads for bit-precise integer types for
+  builtin operators in C++. Fixes #GH82998.
+
+- Fix crash when destructor definition is preceded with an equals sign.
+  Fixes (#GH89544).
+
+- When performing mixed arithmetic between ``_Complex`` floating-point types and integers,
+  Clang now correctly promotes the integer to its corresponding real floating-point
+  type only rather than to the complex type (e.g. ``_Complex float / int`` is now evaluated
+  as ``_Complex float / float`` rather than ``_Complex float / _Complex float``), as mandated
+  by the C standard. This notably improves codegen for `*` and `/`.
+  Fixes #GH31205.
+
+- Fixes an assertion failure on invalid code when trying to define member
+  functions in lambdas.
+
+- Fixed a regression in CTAD where a friend declaration that befriends itself could cause
+  incorrect constraint substitution. (#GH86769).
+
+- Fixed an assertion failure on invalid InitListExpr in C89 mode (#GH88008).
+
+- Fixed missing destructor calls when we branch from the middle of an expression.
+  This could happen through a branch in stmt-expr or in an expression containing a coroutine
+  suspension. Fixes (#GH63818) (#GH88478).
+
+- Clang will no longer diagnose an erroneous non-dependent ``switch`` condition
+  during instantiation, and instead will only diagnose it once, during checking
+  of the function template.
+
+- Clang now allows the value of unroll count to be zero in ``#pragma GCC unroll`` and ``#pragma unroll``.
+  The values of 0 and 1 block any unrolling of the loop. This matches the behavior of GCC.
+  Fixes (`#88624 <https://github.com/llvm/llvm-project/issues/88624>`_).
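+
+  A minimal sketch:
+
+  .. code-block:: c++
+
+     void zero(int *a, int n) {
+       // Unroll counts of 0 and 1 both disable unrolling, matching GCC.
+       #pragma GCC unroll 0
+       for (int i = 0; i < n; ++i)
+         a[i] = 0;
+     }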
+
+- Clang will no longer emit a duplicate -Wunused-value warning for an expression
+  `(A, B)` which evaluates to glvalue `B` that can be converted to non ODR-use. (#GH45783)
+
+- Clang now correctly disallows VLA type compound literals, e.g. ``(int[size]){}``,
+  as the C standard mandates. (#GH89835)
+
+- ``__is_array`` and ``__is_bounded_array`` no longer return ``true`` for
+  zero-sized arrays. Fixes (#GH54705).
+
+- Correctly reject declarations where a statement is required in C.
+  Fixes #GH92775
+
+- Fixed `static_cast` to array of unknown bound. Fixes (#GH62863).
+
+- Fixed Clang crashing when failing to perform some C++ Initialization Sequences. (#GH98102)
+
+- ``__is_trivially_equality_comparable`` no longer returns true for types which
+  have a constrained defaulted comparison operator (#GH89293).
+
+- Fixed Clang generating dangling StringRefs when deserializing Exprs & Stmts (#GH98667)
+
+- ``__has_unique_object_representations`` correctly handles arrays of unknown bounds of
+  types by ensuring they are complete and instantiating them if needed. Fixes (#GH95311).
+
+- ``typeof_unqual`` now properly removes type qualifiers from arrays and their element types. (#GH92667)
+
+- Fixed an assertion failure when a template non-type parameter contains
+  an invalid expression.
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Fix crash when atomic builtins are called with pointer to zero-size struct (#GH90330)
+
+- Clang now allows pointee types of atomic builtin arguments to be complete template types
+  that were not instantiated elsewhere.
+
 Bug Fixes to Attribute Support
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Bug Fixes to C++ Support
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Fix crash when calling the constructor of an invalid class.
+  (#GH10518) (#GH67914) (#GH78388)
+- Fix crash when using the lifetimebound attribute in a function with a trailing return type.
+  (#GH73619)
+- Addressed an issue where constraints involving injected class types were perceived
+  as distinct from their specialization types. (#GH56482)
+- Fixed a bug where variables referenced by requires-clauses inside
+  nested generic lambdas were not properly injected into the constraint scope. (#GH73418)
+- Fixed a crash when substituting into a requires-expression that refers to function
+  parameters during the equivalence determination of two constraint expressions.
+  (#GH74447)
+- Fixed deducing auto& from const int in template parameters of partial
+  specializations. (#GH77189)
+- Fix a crash when using an erroneous type in a return statement.
+  (#GH63244) (#GH79745)
+- Fixed an out-of-bounds error caused by building a recovery expression for ill-formed
+  function calls while substituting into constraints. (#GH58548)
+- Fix incorrect code generation caused by the object argument
+  of ``static operator()`` and ``static operator[]`` calls not being evaluated. (#GH67976)
+- Fix crash and diagnostic with const qualified member operator new.
+  Fixes (#GH79748)
+- Fixed a crash when substituting into a requires-expression that involves parameter packs
+  during the equivalence determination of two constraint expressions. (#GH72557)
+- Fix a crash when specializing an out-of-line member function with a default
+  parameter where we did an incorrect specialization of the initialization of
+  the default parameter. (#GH68490)
+- Fix a crash when trying to call a varargs function that also has an explicit object parameter.
+  Fixes (#GH80971)
+- Reject explicit object parameters on `new` and `delete` operators. (#GH82249)
+- Fixed a bug where abbreviated function templates would append their invented template parameters to
+  an empty template parameter list.
+- Fix parsing of abominable function types inside type traits. Fixes #GH77585
+- Clang now classifies aggregate initialization in C++17 and newer as constant
+  or non-constant more accurately. Previously, only a subset of the initializer
+  elements were considered, misclassifying some initializers as constant. Partially fixes
+  #GH80510.
+- Clang now ignores top-level cv-qualifiers on function parameters in template partial orderings. (#GH75404)
+- No longer reject valid use of the ``_Alignas`` specifier when declaring a
+  local variable, which is supported as a C11 extension in C++. Previously, it
+  was only accepted at namespace scope but not at local function scope.
+- Clang no longer tries to call consteval constructors at runtime when they appear in a member initializer. (#GH82154)
+- Fix crash when using an immediate-escalated function at global scope. (#GH82258)
+- Correctly immediate-escalate lambda conversion functions. (#GH82258)
+- Fixed an issue where template parameters of a nested abbreviated generic lambda within
+  a requires-clause lie at the same depth as those of the surrounding lambda. This,
+  in turn, results in the wrong template argument substitution during constraint checking.
+  (#GH78524)
+- Clang no longer instantiates the exception specification of discarded candidate function
+  templates when determining the primary template of an explicit specialization.
+- Fixed a crash in Microsoft compatibility mode where unqualified dependent base class
+  lookup searches the bases of an incomplete class.
+- Fix a crash when an unresolved overload set is encountered on the RHS of a ``.*`` operator.
+  (#GH53815)
+- In ``__restrict``-qualified member functions, attach ``__restrict`` to the pointer type of
+  ``this`` rather than the pointee type.
+  Fixes (#GH82941), (#GH42411) and (#GH18121).
+- Clang now properly reports supported C++11 attributes when using
+  ``__has_cpp_attribute`` and parses attributes with arguments in C++03 (#GH82995)
+- Clang now properly diagnoses missing 'default' template arguments on a variety
+  of templates. Previously we were diagnosing on any non-function template
+  instead of only on class, alias, and variable templates, as last updated by
+  CWG2032. Fixes (#GH83461)
+- Fixed an issue where an attribute on a declarator would cause the attribute to
+  be destructed prematurely. This fixes a pair of Chromium issues that were brought to
+  our attention by an attempt to fix in (#GH77703). Fixes (#GH83385).
+- Fix evaluation of some immediate calls in default arguments.
+  Fixes (#GH80630)
+- Fixed an issue where the ``RequiresExprBody`` was involved in the lambda dependency
+  calculation. (#GH56556), (#GH82849).
+- Fix a bug where overload resolution falsely reported an ambiguity when it was comparing
+  a member function against a non-member function, or a member function with an
+  explicit object parameter against a member function with no explicit object parameter,
+  when one of the functions had more specialized templates. Fixes #GH82509 and #GH74494
+- Clang now supports direct lambda calls inside of type alias template declarations.
+  This addresses (#GH70601), (#GH76674), (#GH79555), (#GH81145) and (#GH82104).
+- Allow access to a public template alias declaration that refers to a friend's
+  private nested type. (#GH25708).
+- Fixed a crash in constant evaluation when trying to access a
+  captured ``this`` pointer in a lambda with an explicit object parameter.
+  Fixes (#GH80997)
+- Fix an issue where a friend declaration was not set during template class instantiation.
+  Fixes (#GH84368).
+- Fixed a crash while checking constraints of a trailing requires-expression of a lambda, when the
+  expression references an entity declared outside of the lambda. (#GH64808)
+- Clang's __builtin_bit_cast will now produce a constant value for records with empty bases. See:
+  (#GH82383)
+- Fix a crash when instantiating a lambda that captures ``this`` outside of its context. Fixes (#GH85343).
+- Fix an issue where a namespace alias could be defined using a qualified name (all name components
+  following the first `::` were ignored).
+- Fix an out-of-bounds crash when checking the validity of template partial specializations. (part of #GH86757).
+- Fix an issue caused by not handling invalid cases when substituting into the parameter mapping of a constraint. Fixes (#GH86757).
+- Fixed a bug that prevented member function templates of class templates declared with a deduced return type
+  from being explicitly specialized for a given implicit instantiation of the class template.
+- Fixed a crash when ``this`` is used in a dependent class scope function template specialization
+  that instantiates to a static member function.
+- Fix crash when inheriting from a cv-qualified type. Fixes #GH35603
+- Fix a crash when the using enum declaration uses an anonymous enumeration. Fixes (#GH86790).
+- Handled an edge case in ``getFullyPackExpandedSize`` so that we now avoid a false-positive diagnostic. (#GH84220)
+- Clang now correctly tracks type dependence of by-value captures in lambdas with an explicit
+  object parameter.
+  Fixes (#GH70604), (#GH79754), (#GH84163), (#GH84425), (#GH86054), (#GH86398), and (#GH86399).
+- Fix a crash when deducing ``auto`` from an invalid dereference (#GH88329).
+- Fix a crash in requires expression with templated base class member function. Fixes (#GH84020).
+- Fix a crash caused by a struct defined in a type alias template when the structure
+  has fields with dependent type. Fixes (#GH75221).
+- Fix the Itanium mangling of lambdas defined in a member of a local class (#GH88906)
+- Fixed a crash when trying to evaluate a user-defined ``static_assert`` message whose ``size()``
+  function returns a large or negative value. Fixes (#GH89407).
+- Fixed a use-after-free bug in parsing of type constraints with default arguments that involve lambdas. (#GH67235)
+- Fixed bug in which the body of a consteval lambda within a template was not parsed as within an
+  immediate function context.
+- Fix CTAD for ``std::initializer_list``. This allows ``std::initializer_list{1, 2, 3}`` to be deduced as
+  ``std::initializer_list<int>`` as intended.
+- Fix a bug on template partial specialization whose template parameter is `decltype(auto)`.
+- Fix a bug on template partial specialization with issue on deduction of nontype template parameter
+  whose type is `decltype(auto)`. Fixes (#GH68885).
+- Clang now correctly treats the noexcept-specifier of a friend function to be a complete-class context.
+- Fix an assertion failure when parsing invalid members of an anonymous class. (#GH85447)
+- Fixed a misuse of ``UnresolvedLookupExpr`` for ill-formed templated expressions. Fixes (#GH48673), (#GH63243)
+  and (#GH88832).
+- Clang now defers all substitution into the exception specification of a function template specialization
+  until the noexcept-specifier is instantiated.
+- Fix a crash when an implicitly declared ``operator==`` function with a trailing requires-clause has its
+  constraints compared to that of another declaration.
+- Fix a bug where explicit specializations of member functions/function templates would have substitution
+  performed incorrectly when checking constraints. Fixes (#GH90349).
+- Clang now allows constrained member functions to be explicitly specialized for an implicit instantiation
+  of a class template.
+- Fix a C++23 bug in implementation of P2564R3 which evaluates immediate invocations in place
+  within initializers for variables that are usable in constant expressions or are constant
+  initialized, rather than evaluating them as a part of the larger manifestly constant evaluated
+  expression.
+- Fix a bug in access control checking due to delayed checking of friend declarations. Fixes (#GH12361).
+- Correctly treat the compound statement of an ``if consteval`` as an immediate context. Fixes (#GH91509).
+- When partial ordering alias templates against template template parameters,
+  allow pack expansions when the alias has a fixed-size parameter list. Fixes (#GH62529).
+- Clang now ignores template parameters only used within the exception specification of candidate function
+  templates during partial ordering when deducing template arguments from a function declaration or when
+  taking the address of a function template.
+- Fix a bug with checking constrained non-type template parameters for equivalence. Fixes (#GH77377).
+- Fix a bug where the last argument was not considered when selecting the most viable function for
+  explicit object argument member functions. Fixes (#GH92188).
+- Fix a C++11 crash when a non-const non-static member function is defined out-of-line with
+  the ``constexpr`` specifier. Fixes (#GH61004).
+- Clang no longer transforms dependent qualified names into implicit class member access expressions
+  until it can be determined whether the name is that of a non-static member.
+- Clang now correctly diagnoses when the current instantiation is used as an incomplete base class.
+- Clang no longer treats ``constexpr`` class scope function template specializations of non-static members
+  as implicitly ``const`` in language modes after C++11.
+- Fixed a crash when trying to emit captures in a lambda call operator with an explicit object
+  parameter that is called on a derived type of the lambda.
+  Fixes (#GH87210), (#GH89541).
+- Clang no longer tries to check if an expression is immediate-escalating in an unevaluated context.
+  Fixes (#GH91308).
+- Fix a crash caused by a regression in the handling of ``source_location``
+  in dependent contexts. Fixes (#GH92680).
+- Fixed a crash when diagnosing failed conversions involving template parameter
+  packs. (#GH93076)
+- Fixed a regression introduced in Clang 18 causing a static function overloading a non-static function
+  with the same parameters not to be diagnosed. (Fixes #GH93456).
+- Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269).
+- Clang now allows ``@$`` in raw string literals. Fixes (#GH93130).
+- Fix an assertion failure when checking invalid ``this`` usage in the wrong context. (Fixes #GH91536).
+- Clang no longer models dependent NTTP arguments as ``TemplateParamObjectDecl``\ s. Fixes (#GH84052).
+- Fix incorrect merging of modules which contain using declarations which shadow
+  other declarations. This could manifest as ODR checker false positives.
+  Fixes (`#80252 <https://github.com/llvm/llvm-project/issues/80252>`_)
+- Fix a regression introduced in Clang 18 causing incorrect overload resolution in the presence of functions only
+  differing by their constraints when only one of these functions was variadic.
+- Fix a crash when a variable is captured by a block nested inside a lambda. (Fixes #GH93625).
+- Fixed a type constraint substitution issue involving a generic lambda expression. (#GH93821)
+- Fix a crash caused by improper use of ``__array_extent``. (#GH80474)
+- Fixed several bugs in capturing variables within unevaluated contexts. (#GH63845), (#GH67260), (#GH69307),
+  (#GH88081), (#GH89496), (#GH90669), (#GH91633) and (#GH97453).
+- Fixed a crash in constraint instantiation under nested lambdas with dependent parameters.
+- Fixed handling of brace elision when building deduction guides. (#GH64625), (#GH83368).
+- Fixed a failed assertion when attempting to convert an integer representing the difference
+  between the addresses of two labels (a GNU extension) to a pointer within a constant expression. (#GH95366).
+- Fix immediate escalation bugs in the presence of dependent call arguments. (#GH94935)
+- Clang now diagnoses explicit specializations with storage class specifiers in all contexts.
+- Fix an assertion failure caused by parsing a lambda used as a default argument for the value of a
+  forward-declared class. (#GH93512).
+- Fixed a bug in access checking inside return-type-requirement of compound requirements. (#GH93788).
+- Fixed an assertion failure about an invalid conversion when calling a lambda. (#GH96205).
+- Fixed a bug where the first operand of binary ``operator&`` would be transformed as if it were the operand
+  of the address-of operator. (#GH97483).
+- Fixed an assertion failure for a constant expression that is a known integer but is not
+  evaluated as an integer. (#GH96670).
+- Fixed a bug where references to a lambda capture inside a ``noexcept`` specifier were not correctly
+  instantiated. (#GH95735).
+- Fixed a CTAD substitution bug involving type aliases that reference outer template parameters. (#GH94614).
+- Clang now correctly handles unexpanded packs in the template parameter list of a generic lambda expression.
+  (#GH48937)
+- Fix a crash when parsing an invalid type-requirement in a requires expression. (#GH51868)
+- Fix parsing of built-in type-traits such as ``__is_pointer`` in libstdc++ headers. (#GH95598)
+- Fixed a failed assertion when resolving the context of a defaulted comparison method outside of a struct. (#GH96043).
+- Clang now diagnoses explicit object parameters in member pointers and other contexts where they should not appear.
+  Fixes (#GH85992).
+- Fixed a crash-on-invalid bug involving extraneous template parameter with concept substitution. (#GH73885)
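+
+The explicit object parameter fix referenced earlier in this section can be
+illustrated as follows (a sketch, not taken from the patch; all names are
+invented):
+
+.. code-block:: c++
+
+  struct S {
+    // C++23 explicit object parameter ("deducing this").
+    void f(this S self, int, int &ref);
+  };
+
+  void g(S s) {
+    int x = 0;
+    s.f(0, x); // OK
+    s.f(0, 1); // rejected: the last argument now participates in viability
+               // checking, and the rvalue '1' cannot bind to 'int &'.
+  }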
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
+- Clang now properly preserves ``FoundDecls`` within a ``ConceptReference``. (#GH82628)
+- The presence of the ``typename`` keyword is now stored in ``TemplateTemplateParmDecl``.
+- Fixed malformed AST generated for anonymous union access in templates. (#GH90842)
+- Improved preservation of qualifiers and sugar in ``TemplateName``, including the
+  ``template`` keyword.
 
 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^
 
+- Fixed an infinite recursion in ASTImporter on a return type declared inside the
+  body of a C++11 lambda without a trailing return type. (#GH68775)
+- Fixed the declaration name source location of instantiated function definitions. (#GH71161)
+- Improve diagnostic output to print an expression instead of ``no argument`` when comparing values as template arguments.
+
 Miscellaneous Clang Crashes Fixed
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Do not attempt to dump the layout of dependent types or invalid declarations
+  when ``-fdump-record-layouts-complete`` is passed. Fixes #GH83684.
+- Fixed a crash on unhandled ``StructuralValue`` arguments in the template differ. (#GH93068)
+
 OpenACC Specific Changes
 ------------------------
 
@@ -190,36 +1121,131 @@ AMDGPU Support
 X86 Support
 ^^^^^^^^^^^
 
+- Removed KNL/KNM-specific ISA support: AVX512PF, AVX512ER, and PREFETCHWT1.
+- Support has been removed for the AMD "3DNow!" instruction set.
+  Neither modern AMD CPUs nor any Intel CPUs implement these
+  instructions, and they were never widely used.
+
+  * The options ``-m3dnow`` and ``-m3dnowa`` are no longer honored, and will emit a warning if used.
+  * The macros ``__3dNOW__`` and ``__3dNOW_A__`` are no longer ever set by the compiler.
+  * The header ``<mm3dnow.h>`` is deprecated, and emits a warning if included.
+  * The 3DNow! intrinsic functions have been removed: ``_m_femms``,
+    ``_m_pavgusb``, ``_m_pf2id``, ``_m_pfacc``, ``_m_pfadd``,
+    ``_m_pfcmpeq``, ``_m_pfcmpge``, ``_m_pfcmpgt``, ``_m_pfmax``,
+    ``_m_pfmin``, ``_m_pfmul``, ``_m_pfrcp``, ``_m_pfrcpit1``,
+    ``_m_pfrcpit2``, ``_m_pfrsqrt``, ``_m_pfrsqrtit1``, ``_m_pfsub``,
+    ``_m_pfsubr``, ``_m_pi2fd``, ``_m_pmulhrw``, ``_m_pf2iw``,
+    ``_m_pfnacc``, ``_m_pfpnacc``, ``_m_pi2fw``, ``_m_pswapdsf``,
+    ``_m_pswapdsi``.
+  * The compiler builtins corresponding to each of the above
+    intrinsics have also been removed (``__builtin_ia32_femms``, and so on).
+  * "3DNow!" instructions remain supported in assembly code, including
+    inside inline-assembly.
+
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
 
+- ARMv7+ targets now default to allowing unaligned access, except Armv6-M and
+  Armv8-M without the Main Extension. Baremetal targets should check that the
+  new default will work with their system configurations, since it requires
+  that SCTLR.A is 0, SCTLR.U is 1, and that the memory in question is
+  configured as "normal" memory. This brings Clang in line with the default
+  settings for GCC and Arm Compiler. Aside from making Clang align with other
+  compilers, changing the default brings major performance and code size
+  improvements for most targets. We have not changed the default behavior for
+  ARMv6, but may revisit that decision in the future. Users can restore the old
+  behavior with ``-m[no-]unaligned-access``.
+- An alias identifier (``rdma``) has been added for targeting the AArch64
+  Architecture Extension which uses Rounding Doubling Multiply Accumulate
+  instructions (``rdm``). The identifier is available on the command line as
+  a feature modifier for ``-march`` and ``-mcpu`` as well as via target attributes
+  like ``target_version`` or ``target_clones``.
+- Support has been added for the following processors (``-mcpu`` identifiers in parentheses):
+    * Arm Cortex-R52+ (cortex-r52plus).
+    * Arm Cortex-R82AE (cortex-r82ae).
+    * Arm Cortex-A78AE (cortex-a78ae).
+    * Arm Cortex-A520AE (cortex-a520ae).
+    * Arm Cortex-A720AE (cortex-a720ae).
+    * Arm Cortex-A725 (cortex-a725).
+    * Arm Cortex-X925 (cortex-x925).
+    * Arm Neoverse-N3 (neoverse-n3).
+    * Arm Neoverse-V3 (neoverse-v3).
+    * Arm Neoverse-V3AE (neoverse-v3ae).
+
 Android Support
 ^^^^^^^^^^^^^^^
 
 Windows Support
 ^^^^^^^^^^^^^^^
 
+- The clang-cl ``/Ot`` compiler option ("optimize for speed", also implied by
+  ``/O2``) now maps to clang's ``-O3`` optimization level instead of ``-O2``.
+  Users who prefer the old behavior can use ``clang-cl /Ot /clang:-O2 ...``.
+
+- Clang-cl now supports function targets with intrinsic headers. This allows
+  for runtime feature detection of intrinsics. Previously under clang-cl
+  ``immintrin.h`` and similar intrinsic headers would only include the intrinsics
+  if building with that feature enabled at compile time, e.g. ``avxintrin.h``
+  would only be included if AVX was enabled at compile time. This was done to work
+  around the increased include times caused by MSVC STL including ``intrin.h`` under
+  clang-cl. Clang-cl now provides ``intrin0.h`` for MSVC STL and therefore provides all
+  intrinsic features without requiring enablement at compile time. Fixes #GH53520.
+
+- Improved compile times with MSVC STL. MSVC provides ``intrin0.h``, a
+  header that only includes the intrinsics used by MSVC STL, to avoid the
+  cost of ``intrin.h``. When compiled under clang, MSVC STL previously used
+  ``intrin.h`` instead. Clang-cl now provides ``intrin0.h`` for the same compiler
+  throughput purposes as MSVC. Clang-cl also provides ``yvals_core.h`` to redefine
+  ``_STL_INTRIN_HEADER`` to expand to ``intrin0.h`` instead of ``intrin.h``.
+  This also means that if all intrinsic features are enabled at compile time,
+  including STL headers will no longer slow down compile times, since ``intrin.h``
+  is no longer included from MSVC STL.
+
+- When the target triple is ``*-windows-msvc``, strict aliasing is now disabled by default
+  to ensure compatibility with MSVC. Previously, strict aliasing was only disabled if the
+  driver mode was ``cl``.
+
 LoongArch Support
 ^^^^^^^^^^^^^^^^^
 
 RISC-V Support
 ^^^^^^^^^^^^^^
 
+- ``__attribute__((rvv_vector_bits(N)))`` is now supported for RVV ``vbool*_t`` types;
+  see the sketch after this list.
+- Profile names in the ``-march`` option are now supported.
+- Passing empty structs/unions as arguments in C++ is now handled correctly. The behavior is similar to GCC's.
+- ``-m[no-]scalar-strict-align`` and ``-m[no-]vector-strict-align`` options have
+  been added to give separate control of whether scalar or vector misaligned
+  accesses may be created. ``-m[no-]strict-align`` applies to both scalar and
+  vector.
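+
+A sketch of the new ``vbool*_t`` support (assuming a fixed vector length is
+configured, e.g. via ``-mrvv-vector-bits``; the ``riscv_rvv_vector_bits``
+spelling and the ``__riscv_v_fixed_vlen`` macro follow Clang's documented
+fixed-length RVV vector extension):
+
+.. code-block:: c++
+
+  #include <riscv_vector.h>
+
+  // For a vboolN_t type, the attribute argument is the fixed vector
+  // length divided by N.
+  typedef vbool8_t fixed_vbool8_t
+      __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 8)));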
+
 CUDA/HIP Language Changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- PTX is no longer included by default when compiling for CUDA. Using
+  ``--cuda-include-ptx=all`` will restore the old behavior.
+
 CUDA Support
 ^^^^^^^^^^^^
+
+- Clang now supports CUDA SDK up to 12.5.
 
 AIX Support
 ^^^^^^^^^^^
 
-NetBSD Support
-^^^^^^^^^^^^^^
+- Introduced the ``-maix-small-local-dynamic-tls`` option to produce a faster
+  access sequence for local-dynamic TLS variables where the offset from the TLS
+  base is encoded as an immediate operand.
+  This access sequence is not used for TLS variables larger than 32KB, and is
+  currently only supported in 64-bit mode.
 
 WebAssembly Support
 ^^^^^^^^^^^^^^^^^^^
 
+The ``-mcpu=generic`` configuration now enables multivalue and reference-types.
+These proposals are standardized and available in all major engines. Enabling
+multivalue here only enables the language feature but does not turn on the
+multivalue ABI (this enables non-ABI uses of multivalue, like ``exnref``).
+
 AVR Support
 ^^^^^^^^^^^
 
@@ -229,44 +1255,134 @@ DWARF Support in Clang
 Floating Point Support in Clang
 -------------------------------
 
+- Add the ``__builtin_fmaf16`` builtin for floating point types, as sketched below.
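+
+A sketch of the new builtin (assumed semantics: a single-rounding fused
+multiply-add on ``_Float16`` operands, mirroring ``__builtin_fmaf`` for
+``float``):
+
+.. code-block:: c++
+
+  _Float16 fma16(_Float16 a, _Float16 b, _Float16 c) {
+    return __builtin_fmaf16(a, b, c); // computes a * b + c in one rounding
+  }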
+
 Fixed Point Support in Clang
 ----------------------------
 
+- Support fixed point precision macros according to ``7.18a.3`` of
+  `ISO/IEC TR 18037:2008 <https://standards.iso.org/ittf/PubliclyAvailableStandards/c051126_ISO_IEC_TR_18037_2008.zip>`_.
+
 AST Matchers
 ------------
 
+- Fixes a long-standing performance issue in parent map generation for
+  ancestry-based matchers such as ``hasParent`` and ``hasAncestor``, making
+  them significantly faster.
+- ``isInStdNamespace`` now supports ``Decl``\ s declared with ``extern "C++"``.
+- Add ``isExplicitObjectMemberFunction`` (see the sketch after this list).
+- Fixed ``forEachArgumentWithParam`` and ``forEachArgumentWithParamType`` to
+  not skip the explicit object parameter for operator calls.
+- Fixed a ``captureVars`` assertion failure when the matched node does not
+  capture variables. (#GH76425)
+- ``forCallable`` now properly preserves binding on successful match. (#GH89657)
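+
+A sketch of the new matcher in use (the binding name is arbitrary):
+
+.. code-block:: c++
+
+  // Matches C++23 member functions declared with an explicit object
+  // parameter, e.g. void f(this Self &&self).
+  auto M = cxxMethodDecl(isExplicitObjectMemberFunction()).bind("method");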
+
 clang-format
 ------------
 
+- ``AlwaysBreakTemplateDeclarations`` is deprecated and renamed to
+  ``BreakTemplateDeclarations``.
+- ``AlwaysBreakAfterReturnType`` is deprecated and renamed to
+  ``BreakAfterReturnType``.
+- Handles Java switch expressions.
+- Adds ``AllowShortCaseExpressionOnASingleLine`` option.
+- Adds ``AlignCaseArrows`` suboption to ``AlignConsecutiveShortCaseStatements``.
+- Adds ``LeftWithLastLine`` suboption to ``AlignEscapedNewlines``.
+- Adds ``KeepEmptyLines`` option to deprecate ``KeepEmptyLinesAtEOF``
+  and ``KeepEmptyLinesAtTheStartOfBlocks``.
+- Add ``ExceptDoubleParentheses`` sub-option for ``SpacesInParensOptions``
+  to override addition of spaces between multiple, non-redundant parentheses
+  similar to the rules used for ``RemoveParentheses``.
+
 libclang
 --------
 
+- ``clang_getSpellingLocation`` now correctly resolves macro expansions; that
+  is, it returns the spelling location instead of the expansion location (see
+  the sketch below).
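+
+A sketch against the stable libclang C API (``Cursor`` is an assumed
+``CXCursor`` obtained elsewhere):
+
+.. code-block:: c++
+
+  CXFile File;
+  unsigned Line, Column;
+  // For a cursor inside a macro expansion, this now reports where the
+  // token is spelled rather than where the macro is expanded.
+  clang_getSpellingLocation(clang_getCursorLocation(Cursor), &File, &Line,
+                            &Column, /*offset=*/nullptr);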
+
 Static Analyzer
 ---------------
 
 New features
 ^^^^^^^^^^^^
 
+- The attribute ``[[clang::suppress]]`` can now be applied to declarations,
+  as shown in the sketch after this list. (#GH80371)
+
+- Support C++23 static operator calls. (#GH84972)
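+
+A sketch of the attribute applied to a declaration:
+
+.. code-block:: c++
+
+  // Static analyzer diagnostics originating in this function are
+  // suppressed now that the attribute may appear on the declaration.
+  [[clang::suppress]]
+  void knownFalsePositive(int *p) { *p = 42; }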
+
 Crash and bug fixes
 ^^^^^^^^^^^^^^^^^^^
 
+- Fixed a crash on loops whose loop variable was declared in a switch block
+  but not under any case block, when the ``unroll-loops=true`` analyzer config
+  is set. (#GH68819)
+
+- Fixed a crash in the ``security.cert.env.InvalidPtr`` checker when it accidentally
+  matched user-defined ``strerror`` and similar library functions. (#GH88181)
+
+- Fixed a crash when storing through an address that refers to the address of
+  a label. (#GH89185)
+
+- Z3 crosschecking (aka. Z3 refutation) is now bounded, and can't consume
+  more total time than the symbolic execution itself. (#GH97298)
+
 Improvements
 ^^^^^^^^^^^^
 
 Moved checkers
 ^^^^^^^^^^^^^^
 
+- Moved ``alpha.cplusplus.ArrayDelete`` out of the ``alpha`` package
+  to ``cplusplus.ArrayDelete``. (#GH83985)
+  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#cplusplus-arraydelete-c>`__.
+
+- Moved ``alpha.unix.Stream`` out of the ``alpha`` package to
+  ``unix.Stream``. (#GH89247)
+  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-stream-c>`__.
+
+- Moved ``alpha.unix.BlockInCriticalSection`` out of the ``alpha`` package to
+  ``unix.BlockInCriticalSection``. (#GH93815)
+  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-blockincriticalsection-c-c>`__.
+
+- Moved ``alpha.security.cert.pos.34c`` out of the ``alpha`` package to
+  ``security.PutenvStackArray``. (#GH92424, #GH93815)
+  `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#security-putenvstackarray-c>`__.
+
+- Moved ``alpha.core.SizeofPtr`` into ``clang-tidy``
+  ``bugprone-sizeof-expression``. (#GH95118, #GH94356)
+  `Documentation <https://clang.llvm.org/extra/clang-tidy/checks/bugprone/sizeof-expression.html>`__.
+
 .. _release-notes-sanitizers:
 
 Sanitizers
 ----------
 
+- ``-fsanitize=signed-integer-overflow`` now instruments signed arithmetic even
+  when ``-fwrapv`` is enabled. Previously, only division checks were enabled.
+
+  Users with ``-fwrapv`` as well as a sanitizer group like
+  ``-fsanitize=undefined`` or ``-fsanitize=integer`` enabled may want to
+  manually disable potentially noisy signed integer overflow checks with
+  ``-fno-sanitize=signed-integer-overflow``; see the sketch after this list.
+
+- ``-fsanitize=cfi -fsanitize-cfi-cross-dso`` (cross-DSO CFI instrumentation)
+  now generates the ``__cfi_check`` function with proper target-specific
+  attributes, for example allowing unwind table generation.
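+
+A minimal illustration of the ``-fwrapv`` change above (a sketch):
+
+.. code-block:: c++
+
+  // Build with: clang++ -fwrapv -fsanitize=signed-integer-overflow
+  // Previously only division checks fired under -fwrapv; this addition
+  // is now instrumented and reported at runtime when it wraps, e.g. for
+  // add(INT_MAX, 1).
+  int add(int a, int b) { return a + b; }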
+
 Python Binding Changes
 ----------------------
 
+- Exposed the ``CXRewriter`` API as ``class Rewriter``.
+- Add some missing kinds from ``Index.h`` (CursorKind: 149-156, 272-320, 420-437.
+  TemplateArgumentKind: 5-9. TypeKind: 161-175 and 178).
+- Add support for retrieving binary operator information through
+  ``Cursor.binary_operator()``.
+
 OpenMP Support
 --------------
 
+- Added support for the ``[[omp::assume]]`` attribute, as sketched below.
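+
+A sketch of the attribute (assuming the ``"omp_no_openmp"`` assumption
+string, one of the documented OpenMP assumption clauses):
+
+.. code-block:: c++
+
+  // Promises the optimizer that this function performs no OpenMP work.
+  [[omp::assume("omp_no_openmp")]] void compute();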
+
 Additional Information
 ======================
 
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index 9bae0bd83243b..a35a867b96bd7 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -92,7 +92,6 @@ Using Clang Tools
    ClangFormatStyleOptions
    ClangFormattedStatus
    ClangLinkerWrapper
-   ClangNVLinkWrapper
    ClangOffloadBundler
    ClangOffloadPackager
    ClangRepl
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 6d1c8ca8a2f96..608bd90fcc3ff 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -738,12 +738,6 @@ class ASTContext : public RefCountedBase<ASTContext> {
   }
   void Deallocate(void *Ptr) const {}
 
-  llvm::StringRef backupStr(llvm::StringRef S) const {
-    char *Buf = new (*this) char[S.size()];
-    std::copy(S.begin(), S.end(), Buf);
-    return llvm::StringRef(Buf, S.size());
-  }
-
   /// Allocates a \c DeclListNode or returns one from the \c ListNodeFreeList
   /// pool.
   DeclListNode *AllocateDeclListNode(clang::NamedDecl *ND) {
@@ -1293,7 +1287,7 @@ class ASTContext : public RefCountedBase<ASTContext> {
   getPointerAuthVTablePointerDiscriminator(const CXXRecordDecl *RD);
 
   /// Return the "other" type-specific discriminator for the given type.
-  uint16_t getPointerAuthTypeDiscriminator(QualType T);
+  uint16_t getPointerAuthTypeDiscriminator(QualType T) const;
 
   /// Apply Objective-C protocol qualifiers to the given type.
   /// \param allowOnPointerType specifies if we can apply protocol
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index f86f1818110e6..c2feac525c1ea 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -4854,7 +4854,15 @@ class CXXFoldExpr : public Expr {
   CXXFoldExpr(QualType T, UnresolvedLookupExpr *Callee,
               SourceLocation LParenLoc, Expr *LHS, BinaryOperatorKind Opcode,
               SourceLocation EllipsisLoc, Expr *RHS, SourceLocation RParenLoc,
-              std::optional<unsigned> NumExpansions);
+              std::optional<unsigned> NumExpansions)
+      : Expr(CXXFoldExprClass, T, VK_PRValue, OK_Ordinary),
+        LParenLoc(LParenLoc), EllipsisLoc(EllipsisLoc), RParenLoc(RParenLoc),
+        NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Opcode(Opcode) {
+    SubExprs[SubExpr::Callee] = Callee;
+    SubExprs[SubExpr::LHS] = LHS;
+    SubExprs[SubExpr::RHS] = RHS;
+    setDependence(computeDependence(this));
+  }
 
   CXXFoldExpr(EmptyShell Empty) : Expr(CXXFoldExprClass, Empty) {}
 
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 4825979a974d2..1293d0ddbc117 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -902,7 +902,7 @@ def PatchableFunctionEntry
     : InheritableAttr,
       TargetSpecificAttr<TargetArch<
           ["aarch64", "aarch64_be", "loongarch32", "loongarch64", "riscv32",
-           "riscv64", "x86", "x86_64", "ppc", "ppc64"]>> {
+           "riscv64", "x86", "x86_64"]>> {
   let Spellings = [GCC<"patchable_function_entry">];
   let Subjects = SubjectList<[Function, ObjCMethod]>;
   let Args = [UnsignedArgument<"Count">, DefaultIntArgument<"Offset", 0>];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 99738812c8157..09cf4f80bd999 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -5800,8 +5800,7 @@ takes precedence over the command line option ``-fpatchable-function-entry=N,M``
 ``M`` defaults to 0 if omitted.
 
 This attribute is only supported on
-aarch64/aarch64-be/loongarch32/loongarch64/riscv32/riscv64/i386/x86-64/ppc/ppc64 targets.
-For ppc/ppc64 targets, AIX is still not supported.
+aarch64/aarch64-be/loongarch32/loongarch64/riscv32/riscv64/i386/x86-64 targets.
 }];
 }
 
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 12d7b8c0205ee..08ece01009387 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -952,6 +952,11 @@ def warn_module_conflict : Warning<
   InGroup<ModuleConflict>;
 
 // C++20 modules
+def err_module_decl_cannot_be_macros : Error<
+  "the module name in a module%select{| partition}0 declaration cannot contain "
+  "an object-like macro %1">;
+def err_unxepected_paren_in_module_decl : Error<
+  "unexpected '(' after the module name in a module%select{| partition}0 declaration">;
 def err_header_import_semi_in_macro : Error<
   "semicolon terminating header import declaration cannot be produced "
   "by a macro">;
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index f8d50d12bb935..12aab09f28556 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1260,9 +1260,6 @@ def warn_pragma_intrinsic_builtin : Warning<
 def warn_pragma_unused_expected_var : Warning<
   "expected '#pragma unused' argument to be a variable name">,
   InGroup<IgnoredPragmas>;
-// - #pragma mc_func
-def err_pragma_mc_func_not_supported :
-   Error<"#pragma mc_func is not supported">;
 // - #pragma init_seg
 def warn_pragma_init_seg_unsupported_target : Warning<
   "'#pragma init_seg' is only supported when targeting a "
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 00affee1ea67f..d60f32674ca3a 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -949,9 +949,6 @@ def note_ptrauth_virtual_function_pointer_incomplete_arg_ret :
 def note_ptrauth_virtual_function_incomplete_arg_ret_type :
   Note<"%0 is incomplete">;
 
-def err_ptrauth_indirect_goto_addrlabel_arithmetic : Error<
-  "%select{subtraction|addition}0 of address-of-label expressions is not "
-  "supported with ptrauth indirect gotos">;
 
 /// main()
 // static main() is not an error in C, just in C++.
@@ -3495,7 +3492,7 @@ def err_attr_tlsmodel_arg : Error<"tls_model must be \"global-dynamic\", "
 
 def err_attr_codemodel_arg : Error<"code model '%0' is not supported on this target">;
 
-def err_aix_attr_unsupported : Error<"%0 attribute is not yet supported on AIX">;
+def err_aix_attr_unsupported_tls_model : Error<"TLS model '%0' is not yet supported on AIX">;
 
 def err_tls_var_aligned_over_maximum : Error<
   "alignment (%0) of thread-local variable %1 is greater than the maximum supported "
@@ -5168,7 +5165,7 @@ def warn_cxx11_compat_variable_template : Warning<
   InGroup<CXXPre14Compat>, DefaultIgnore;
 def err_template_variable_noparams : Error<
   "extraneous 'template<>' in declaration of variable %0">;
-def err_template_member : Error<"non-static data member %0 cannot be declared as a template">;
+def err_template_member : Error<"member %0 declared as a template">;
 def err_member_with_template_arguments : Error<"member %0 cannot have template arguments">;
 def err_template_member_noparams : Error<
   "extraneous 'template<>' in declaration of member %0">;
@@ -7599,6 +7596,9 @@ def err_nested_non_static_member_use : Error<
 def warn_cxx98_compat_non_static_member_use : Warning<
   "use of non-static data member %0 in an unevaluated context is "
   "incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
+def err_form_ptr_to_member_from_parenthesized_expr : Error<
+  "cannot form pointer to member from a parenthesized expression; "
+  "did you mean to remove the parentheses?">;
 def err_invalid_incomplete_type_use : Error<
   "invalid use of incomplete type %0">;
 def err_builtin_func_cast_more_than_one_arg : Error<
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index dc71ef8f98692..2f864ff1c0edf 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -108,11 +108,9 @@ FEATURE(ptrauth_calls, LangOpts.PointerAuthCalls)
 FEATURE(ptrauth_returns, LangOpts.PointerAuthReturns)
 FEATURE(ptrauth_vtable_pointer_address_discrimination, LangOpts.PointerAuthVTPtrAddressDiscrimination)
 FEATURE(ptrauth_vtable_pointer_type_discrimination, LangOpts.PointerAuthVTPtrTypeDiscrimination)
-FEATURE(ptrauth_type_info_vtable_pointer_discrimination, LangOpts.PointerAuthTypeInfoVTPtrDiscrimination)
 FEATURE(ptrauth_member_function_pointer_type_discrimination, LangOpts.PointerAuthCalls)
 FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini)
 FEATURE(ptrauth_function_pointer_type_discrimination, LangOpts.PointerAuthFunctionTypeDiscrimination)
-FEATURE(ptrauth_indirect_gotos, LangOpts.PointerAuthIndirectGotos)
 EXTENSION(swiftcc,
   PP.getTargetInfo().checkCallingConvention(CC_Swift) ==
   clang::TargetInfo::CCCR_OK)
diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
index ae9ebd9f59154..f40f74d0355ad 100644
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@@ -180,6 +180,10 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo {
   LLVM_PREFERRED_TYPE(bool)
   unsigned IsModulesImport : 1;
 
+  // True if this is the 'module' contextual keyword.
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned IsModulesDecl : 1;
+
   // True if this is a mangled OpenMP variant name.
   LLVM_PREFERRED_TYPE(bool)
   unsigned IsMangledOpenMPVariantName : 1;
@@ -196,7 +200,7 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo {
   LLVM_PREFERRED_TYPE(bool)
   unsigned IsFinal : 1;
 
-  // 22 bits left in a 64-bit word.
+  // 21 bits left in a 64-bit word.
 
   // Managed by the language front-end.
   void *FETokenInfo = nullptr;
@@ -212,8 +216,8 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo {
         IsCPPOperatorKeyword(false), NeedsHandleIdentifier(false),
         IsFromAST(false), ChangedAfterLoad(false), FEChangedAfterLoad(false),
         RevertedTokenID(false), OutOfDate(false), IsModulesImport(false),
-        IsMangledOpenMPVariantName(false), IsDeprecatedMacro(false),
-        IsRestrictExpansion(false), IsFinal(false) {}
+        IsModulesDecl(false), IsMangledOpenMPVariantName(false),
+        IsDeprecatedMacro(false), IsRestrictExpansion(false), IsFinal(false) {}
 
 public:
   IdentifierInfo(const IdentifierInfo &) = delete;
@@ -520,6 +524,18 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo {
       RecomputeNeedsHandleIdentifier();
   }
 
+  /// Determine whether this is the contextual keyword \c module.
+  bool isModulesDeclaration() const { return IsModulesDecl; }
+
+  /// Set whether this identifier is the contextual keyword \c module.
+  void setModulesDeclaration(bool I) {
+    IsModulesDecl = I;
+    if (I)
+      NeedsHandleIdentifier = true;
+    else
+      RecomputeNeedsHandleIdentifier();
+  }
+
   /// Determine whether this is the mangled name of an OpenMP variant.
   bool isMangledOpenMPVariantName() const { return IsMangledOpenMPVariantName; }
 
@@ -740,6 +756,8 @@ class IdentifierTable {
     // If this is the 'import' contextual keyword, mark it as such.
     if (Name == "import")
       II->setModulesImport(true);
+    else if (Name == "module")
+      II->setModulesDeclaration(true);
 
     return *II;
   }
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 834a6f6cd43e3..a6f36b23f07dc 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -165,11 +165,9 @@ LANGOPT(ExperimentalLibrary, 1, 0, "enable unstable and experimental library fea
 LANGOPT(PointerAuthIntrinsics, 1, 0, "pointer authentication intrinsics")
 LANGOPT(PointerAuthCalls  , 1, 0, "function pointer authentication")
 LANGOPT(PointerAuthReturns, 1, 0, "return pointer authentication")
-LANGOPT(PointerAuthIndirectGotos, 1, 0, "indirect gotos pointer authentication")
 LANGOPT(PointerAuthAuthTraps, 1, 0, "pointer authentication failure traps")
 LANGOPT(PointerAuthVTPtrAddressDiscrimination, 1, 0, "incorporate address discrimination in authenticated vtable pointers")
 LANGOPT(PointerAuthVTPtrTypeDiscrimination, 1, 0, "incorporate type discrimination in authenticated vtable pointers")
-LANGOPT(PointerAuthTypeInfoVTPtrDiscrimination, 1, 0, "incorporate type and address discrimination in authenticated vtable pointers for std::type_info")
 LANGOPT(PointerAuthInitFini, 1, 0, "sign function pointers in init/fini arrays")
 BENIGN_LANGOPT(PointerAuthFunctionTypeDiscrimination, 1, 0,
                "Use type discrimination when signing function pointers")
diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h
index 417b4b00648c7..197d63642ca6d 100644
--- a/clang/include/clang/Basic/PointerAuthOptions.h
+++ b/clang/include/clang/Basic/PointerAuthOptions.h
@@ -25,11 +25,6 @@ namespace clang {
 
 constexpr unsigned PointerAuthKeyNone = -1;
 
-/// Constant discriminator for std::type_info vtable pointers: 0xB1EA/45546
-/// The value is ptrauth_string_discriminator("_ZTVSt9type_info"), i.e.,
-/// the vtable type discriminator for classes derived from std::type_info.
-constexpr uint16_t StdTypeInfoVTablePointerConstantDiscrimination = 0xB1EA;
-
 class PointerAuthSchema {
 public:
   enum class Kind : unsigned {
@@ -159,9 +154,6 @@ class PointerAuthSchema {
 };
 
 struct PointerAuthOptions {
-  /// Do indirect goto label addresses need to be authenticated?
-  bool IndirectGotos = false;
-
   /// The ABI for C function pointers.
   PointerAuthSchema FunctionPointers;
 
@@ -183,9 +175,6 @@ struct PointerAuthOptions {
 
   /// The ABI for variadic C++ virtual function pointers.
   PointerAuthSchema CXXVirtualVariadicFunctionPointers;
-
-  /// The ABI for C++ member function pointers.
-  PointerAuthSchema CXXMemberFunctionPointers;
 };
 
 } // end namespace clang
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 7f4912b9bcd96..8db18c049b6d0 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -1003,6 +1003,9 @@ ANNOTATION(module_include)
 ANNOTATION(module_begin)
 ANNOTATION(module_end)
 
+// Annotations for C++, Clang and Objective-C named modules.
+ANNOTATION(module_name)
+
 // Annotation for a header_name token that has been looked up and transformed
 // into the name of a header unit.
 ANNOTATION(header_unit)
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 3098fa67e6a51..6390ba3f9fe5e 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -289,7 +289,7 @@ def SPLATQ : WInst<"splat_laneq", ".(!Q)I",
                    "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> {
   let isLaneQ = 1;
 }
-let TargetGuard = "bf16,neon" in {
+let TargetGuard = "bf16" in {
   def SPLAT_BF  : WInst<"splat_lane", ".(!q)I", "bQb">;
   def SPLATQ_BF : WInst<"splat_laneq", ".(!Q)I", "bQb"> {
     let isLaneQ = 1;
@@ -323,7 +323,7 @@ def VMLSL    : SOpInst<"vmlsl", "(>Q)(>Q)..", "csiUcUsUi", OP_MLSL>;
 def VQDMULH  : SInst<"vqdmulh", "...", "siQsQi">;
 def VQRDMULH : SInst<"vqrdmulh", "...", "siQsQi">;
 
-let TargetGuard = "v8.1a,neon" in {
+let TargetGuard = "v8.1a" in {
 def VQRDMLAH : SInst<"vqrdmlah", "....", "siQsQi">;
 def VQRDMLSH : SInst<"vqrdmlsh", "....", "siQsQi">;
 }
@@ -614,7 +614,7 @@ def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi">;
 def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi">;
 }
 
-let TargetGuard = "v8.1a,neon" in {
+let TargetGuard = "v8.1a" in {
 def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
 def VQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "...qI", "siQsQi", OP_QRDMLSH_LN>;
 }
@@ -957,7 +957,7 @@ def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "(>Q)(>Q)QQ", "si", OP_QDMLALHi>;
 def VQDMLAL_HIGH_N : SOpInst<"vqdmlal_high_n", "(>Q)(>Q)Q1", "si", OP_QDMLALHi_N>;
 def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "(>Q)(>Q)QQ", "si", OP_QDMLSLHi>;
 def VQDMLSL_HIGH_N : SOpInst<"vqdmlsl_high_n", "(>Q)(>Q)Q1", "si", OP_QDMLSLHi_N>;
-let TargetGuard = "aes,neon" in {
+let TargetGuard = "aes" in {
   def VMULL_P64    : SInst<"vmull", "(1>)11", "Pl">;
   def VMULL_HIGH_P64 : SOpInst<"vmull_high", "(1>)..", "HPl", OP_MULLHi_P64>;
 }
@@ -1091,7 +1091,7 @@ let isLaneQ = 1 in {
 def VQDMULH_LANEQ  : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
 def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
 }
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" in {
 def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> {
   let isLaneQ = 1;
 }
@@ -1122,14 +1122,14 @@ def VEXT_A64 : WInst<"vext", "...I", "dQdPlQPl">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Crypto
-let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "aes,neon" in {
+let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "aes" in {
 def AESE : SInst<"vaese", "...", "QUc">;
 def AESD : SInst<"vaesd", "...", "QUc">;
 def AESMC : SInst<"vaesmc", "..", "QUc">;
 def AESIMC : SInst<"vaesimc", "..", "QUc">;
 }
 
-let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "sha2,neon" in {
+let ArchGuard = "__ARM_ARCH >= 8", TargetGuard = "sha2" in {
 def SHA1H : SInst<"vsha1h", "11", "Ui">;
 def SHA1SU1 : SInst<"vsha1su1", "...", "QUi">;
 def SHA256SU0 : SInst<"vsha256su0", "...", "QUi">;
@@ -1143,7 +1143,7 @@ def SHA256H2 : SInst<"vsha256h2", "....", "QUi">;
 def SHA256SU1 : SInst<"vsha256su1", "....", "QUi">;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3" in {
 def BCAX : SInst<"vbcax", "....", "QUcQUsQUiQUlQcQsQiQl">;
 def EOR3 : SInst<"veor3", "....", "QUcQUsQUiQUlQcQsQiQl">;
 def RAX1 : SInst<"vrax1", "...", "QUl">;
@@ -1153,14 +1153,14 @@ def XAR :  SInst<"vxar", "...I", "QUl">;
 }
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3" in {
 def SHA512SU0 : SInst<"vsha512su0", "...", "QUl">;
 def SHA512su1 : SInst<"vsha512su1", "....", "QUl">;
 def SHA512H : SInst<"vsha512h", "....", "QUl">;
 def SHA512H2 : SInst<"vsha512h2", "....", "QUl">;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4" in {
 def SM3SS1 : SInst<"vsm3ss1", "....", "QUi">;
 def SM3TT1A : SInst<"vsm3tt1a", "....I", "QUi">;
 def SM3TT1B : SInst<"vsm3tt1b", "....I", "QUi">;
@@ -1170,7 +1170,7 @@ def SM3PARTW1 : SInst<"vsm3partw1", "....", "QUi">;
 def SM3PARTW2 : SInst<"vsm3partw2", "....", "QUi">;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4" in {
 def SM4E : SInst<"vsm4e", "...", "QUi">;
 def SM4EKEY : SInst<"vsm4ekey", "...", "QUi">;
 }
@@ -1227,7 +1227,7 @@ def FRINTZ_S64 : SInst<"vrnd", "..", "dQd">;
 def FRINTI_S64 : SInst<"vrndi", "..", "dQd">;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.5a,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.5a" in {
 def FRINT32X_S32 : SInst<"vrnd32x", "..", "fQf">;
 def FRINT32Z_S32 : SInst<"vrnd32z", "..", "fQf">;
 def FRINT64X_S32 : SInst<"vrnd64x", "..", "fQf">;
@@ -1401,7 +1401,7 @@ def SCALAR_SQDMULH : SInst<"vqdmulh", "111", "SsSi">;
 // Scalar Integer Saturating Rounding Doubling Multiply Half High
 def SCALAR_SQRDMULH : SInst<"vqrdmulh", "111", "SsSi">;
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" in {
 ////////////////////////////////////////////////////////////////////////////////
 // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half
 def SCALAR_SQRDMLAH : SInst<"vqrdmlah", "1111", "SsSi">;
@@ -1632,7 +1632,7 @@ def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_
   let isLaneQ = 1;
 }
 
-let TargetGuard = "v8.1a,neon" in {
+let TargetGuard = "v8.1a" in {
 // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half
 def SCALAR_SQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "111.I", "SsSi", OP_SCALAR_QRDMLAH_LN>;
 def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN> {
@@ -1654,7 +1654,7 @@ def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcS
 } // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)"
 
 // ARMv8.2-A FP16 vector intrinsics for A32/A64.
-let TargetGuard = "fullfp16,neon" in {
+let TargetGuard = "fullfp16" in {
 
   // ARMv8.2-A FP16 one-operand vector intrinsics.
 
@@ -1679,7 +1679,7 @@ let TargetGuard = "fullfp16,neon" in {
   def VCVTP_U16    : SInst<"vcvtp_u16", "U.", "hQh">;
 
   // Vector rounding
-  let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)", TargetGuard = "fullfp16,neon" in {
+  let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)", TargetGuard = "fullfp16" in {
     def FRINTZH      : SInst<"vrnd",  "..", "hQh">;
     def FRINTNH      : SInst<"vrndn", "..", "hQh">;
     def FRINTAH      : SInst<"vrnda", "..", "hQh">;
@@ -1728,7 +1728,7 @@ let TargetGuard = "fullfp16,neon" in {
   // Max/Min
   def VMAXH         : SInst<"vmax", "...", "hQh">;
   def VMINH         : SInst<"vmin", "...", "hQh">;
-  let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)", TargetGuard = "fullfp16,neon" in {
+  let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)", TargetGuard = "fullfp16" in {
     def FMAXNMH       : SInst<"vmaxnm", "...", "hQh">;
     def FMINNMH       : SInst<"vminnm", "...", "hQh">;
   }
@@ -1775,7 +1775,7 @@ def VEXTH      : WInst<"vext", "...I", "hQh">;
 def VREV64H    : WOpInst<"vrev64", "..", "hQh", OP_REV64>;
 
 // ARMv8.2-A FP16 vector intrinsics for A64 only.
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fullfp16,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fullfp16" in {
 
   // Vector rounding
   def FRINTIH      : SInst<"vrndi", "..", "hQh">;
@@ -1872,11 +1872,11 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in {
 }
 
 // v8.2-A dot product instructions.
-let TargetGuard = "dotprod,neon" in {
+let TargetGuard = "dotprod" in {
   def DOT : SInst<"vdot", "..(<<)(<<)", "iQiUiQUi">;
   def DOT_LANE : SOpInst<"vdot_lane", "..(<<)(<<q)I", "iUiQiQUi", OP_DOT_LN>;
 }
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "dotprod,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "dotprod" in {
   // Variants indexing into a 128-bit vector are A64 only.
   def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", OP_DOT_LNQ> {
     let isLaneQ = 1;
@@ -1884,7 +1884,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "d
 }
 
 // v8.2-A FP16 fused multiply-add long instructions.
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fp16fml,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fp16fml" in {
   def VFMLAL_LOW  : SInst<"vfmlal_low",  ">>..", "hQh">;
   def VFMLSL_LOW  : SInst<"vfmlsl_low",  ">>..", "hQh">;
   def VFMLAL_HIGH : SInst<"vfmlal_high", ">>..", "hQh">;
@@ -1909,7 +1909,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
   }
 }
 
-let TargetGuard = "i8mm,neon" in {
+let TargetGuard = "i8mm" in {
   def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
   def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
 
@@ -1926,7 +1926,7 @@ let TargetGuard = "i8mm,neon" in {
   }
 }
 
-let TargetGuard = "bf16,neon" in {
+let TargetGuard = "bf16" in {
   def VDOT_BF : SInst<"vbfdot", "..BB", "fQf">;
   def VDOT_LANE_BF : SOpInst<"vbfdot_lane", "..B(Bq)I", "fQf", OP_BFDOT_LN>;
   def VDOT_LANEQ_BF : SOpInst<"vbfdot_laneq", "..B(BQ)I", "fQf", OP_BFDOT_LNQ> {
@@ -1970,7 +1970,7 @@ multiclass VCMLA_ROTS<string type, string lanety, string laneqty> {
 }
 
 // v8.3-A Vector complex addition intrinsics
-let TargetGuard = "v8.3a,fullfp16,neon" in {
+let TargetGuard = "v8.3a,fullfp16" in {
   def VCADD_ROT90_FP16   : SInst<"vcadd_rot90", "...", "h">;
   def VCADD_ROT270_FP16  : SInst<"vcadd_rot270", "...", "h">;
   def VCADDQ_ROT90_FP16  : SInst<"vcaddq_rot90", "QQQ", "h">;
@@ -1978,7 +1978,7 @@ let TargetGuard = "v8.3a,fullfp16,neon" in {
 
   defm VCMLA_FP16  : VCMLA_ROTS<"h", "uint32x2_t", "uint32x4_t">;
 }
-let TargetGuard = "v8.3a,neon" in {
+let TargetGuard = "v8.3a" in {
   def VCADD_ROT90   : SInst<"vcadd_rot90", "...", "f">;
   def VCADD_ROT270  : SInst<"vcadd_rot270", "...", "f">;
   def VCADDQ_ROT90  : SInst<"vcaddq_rot90", "QQQ", "f">;
@@ -1986,7 +1986,7 @@ let TargetGuard = "v8.3a,neon" in {
 
   defm VCMLA_F32        : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">;
 }
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.3a,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.3a" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
 
@@ -1994,7 +1994,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v
 }
 
 // V8.2-A BFloat intrinsics
-let TargetGuard = "bf16,neon" in {
+let TargetGuard = "bf16" in {
   def VCREATE_BF : NoTestOpInst<"vcreate", ".(IU>)", "b", OP_CAST> {
     let BigEndianSafe = 1;
   }
@@ -2058,14 +2058,14 @@ let TargetGuard = "bf16,neon" in {
   def SCALAR_CVT_F32_BF16 : SOpInst<"vcvtah_f32", "(1F>)(1!)", "b", OP_CVT_F32_BF16>;
 }
 
-let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
+let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16" in {
   def VCVT_BF16_F32_A32_INTERNAL : WInst<"__a32_vcvt_bf16", "BQ", "f">;
   def VCVT_BF16_F32_A32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A32>;
   def VCVT_LOW_BF16_F32_A32 : SOpInst<"vcvt_low_bf16",  "BQ", "Qf", OP_VCVT_BF16_F32_LO_A32>;
   def VCVT_HIGH_BF16_F32_A32 : SOpInst<"vcvt_high_bf16", "BBQ", "Qf", OP_VCVT_BF16_F32_HI_A32>;
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16" in {
   def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
   def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
   def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
@@ -2077,14 +2077,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "b
   def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>;
 }
 
-let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
+let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16" in {
   let BigEndianSafe = 1 in {
     defm VREINTERPRET_BF : REINTERPRET_CROSS_TYPES<
         "csilUcUsUiUlhfPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQPcQPsQPl", "bQb">;
   }
 }
 
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16" in {
   let BigEndianSafe = 1 in {
     defm VVREINTERPRET_BF : REINTERPRET_CROSS_TYPES<
         "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", "bQb">;
@@ -2092,7 +2092,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "b
 }
 
 // v8.9a/v9.4a LRCPC3 intrinsics
-let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "rcpc3,neon" in {
+let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "rcpc3" in {
   def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
   def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
 }
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index 3b8015daee6d9..b8155c187d1bc 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -265,7 +265,7 @@ class Inst <string n, string p, string t, Operation o> {
   string Prototype = p;
   string Types = t;
   string ArchGuard = "";
-  string TargetGuard = "neon";
+  string TargetGuard = "";
 
   Operation Operation = o;
   bit BigEndianSafe = 0;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 69269cf7537b0..8707e71f2a319 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3998,10 +3998,6 @@ def ftime_trace_granularity_EQ : Joined<["-"], "ftime-trace-granularity=">, Grou
   HelpText<"Minimum time granularity (in microseconds) traced by time profiler">,
   Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
   MarshallingInfoInt<FrontendOpts<"TimeTraceGranularity">, "500u">;
-def ftime_trace_verbose : Joined<["-"], "ftime-trace-verbose">, Group<f_Group>,
-  HelpText<"Make time trace capture verbose event details (e.g. source filenames). This can increase the size of the output by 2-3 times">,
-  Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
-  MarshallingInfoFlag<FrontendOpts<"TimeTraceVerbose">>;
 def ftime_trace_EQ : Joined<["-"], "ftime-trace=">, Group<f_Group>,
   HelpText<"Similar to -ftime-trace. Specify the JSON file or a directory which will contain the JSON file">,
   Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
@@ -4249,13 +4245,9 @@ defm ptrauth_vtable_pointer_address_discrimination :
   OptInCC1FFlag<"ptrauth-vtable-pointer-address-discrimination", "Enable address discrimination of vtable pointers">;
 defm ptrauth_vtable_pointer_type_discrimination :
   OptInCC1FFlag<"ptrauth-vtable-pointer-type-discrimination", "Enable type discrimination of vtable pointers">;
-defm ptrauth_type_info_vtable_pointer_discrimination :
-  OptInCC1FFlag<"ptrauth-type-info-vtable-pointer-discrimination", "Enable type and address discrimination of vtable pointer of std::type_info">;
 defm ptrauth_init_fini : OptInCC1FFlag<"ptrauth-init-fini", "Enable signing of function pointers in init/fini arrays">;
 defm ptrauth_function_pointer_type_discrimination : OptInCC1FFlag<"ptrauth-function-pointer-type-discrimination",
   "Enable type discrimination on C function pointers">;
-defm ptrauth_indirect_gotos : OptInCC1FFlag<"ptrauth-indirect-gotos",
-  "Enable signing and authentication of indirect goto targets">;
 }
 
 def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
@@ -8086,13 +8078,6 @@ def source_date_epoch : Separate<["-"], "source-date-epoch">,
 
 } // let Visibility = [CC1Option]
 
-defm err_pragma_mc_func_aix : BoolFOption<"err-pragma-mc-func-aix",
-  PreprocessorOpts<"ErrorOnPragmaMcfuncOnAIX">, DefaultFalse,
-  PosFlag<SetTrue, [], [ClangOption, CC1Option],
-          "Treat uses of #pragma mc_func as errors">,
-  NegFlag<SetFalse,[], [ClangOption, CC1Option],
-          "Ignore uses of #pragma mc_func">>;
-
 //===----------------------------------------------------------------------===//
 // CUDA Options
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index 8241925c98476..5e5034fe01eb5 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -580,11 +580,6 @@ class FrontendOptions {
   /// Minimum time granularity (in microseconds) traced by time profiler.
   unsigned TimeTraceGranularity;
 
-  /// Make time trace capture verbose event details (e.g. source filenames).
-  /// This can increase the size of the output by 2-3 times.
-  LLVM_PREFERRED_TYPE(bool)
-  unsigned TimeTraceVerbose : 1;
-
   /// Path which stores the output files for -ftime-trace
   std::string TimeTracePath;
 
@@ -606,8 +601,7 @@ class FrontendOptions {
         EmitSymbolGraph(false), EmitExtensionSymbolGraphs(false),
         EmitSymbolGraphSymbolLabelsForTesting(false),
         EmitPrettySymbolGraphs(false), GenReducedBMI(false),
-        UseClangIRPipeline(false), TimeTraceGranularity(500),
-        TimeTraceVerbose(false) {}
+        UseClangIRPipeline(false), TimeTraceGranularity(500) {}
 
   /// getInputKindForExtension - Return the appropriate input kind for a file
   /// extension. For example, "c" would return Language::C.
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index fc7d0053f2323..56aef99a3f38a 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -615,10 +615,6 @@ class Preprocessor {
 
   ModuleDeclSeq ModuleDeclState;
 
-  /// Whether the module import expects an identifier next. Otherwise,
-  /// it expects a '.' or ';'.
-  bool ModuleImportExpectsIdentifier = false;
-
   /// The identifier and source location of the currently-active
   /// \#pragma clang arc_cf_code_audited begin.
   std::pair<IdentifierInfo *, SourceLocation> PragmaARCCFCodeAuditedInfo;
@@ -1744,11 +1740,14 @@ class Preprocessor {
   /// Lex a token, forming a header-name token if possible.
   bool LexHeaderName(Token &Result, bool AllowMacroExpansion = true);
 
+  /// Lex a module name or a partition name.
+  bool LexModuleName(Token &Result, bool IsImport);
+
   /// Lex the parameters for an #embed directive, returns nullopt on error.
   std::optional<LexEmbedParametersResult> LexEmbedParameters(Token &Current,
                                                              bool ForHasEmbed);
-
   bool LexAfterModuleImport(Token &Result);
+  bool LexAfterModuleDecl(Token &Result);
   void CollectPpImportSuffix(SmallVectorImpl<Token> &Toks);
 
   void makeModuleVisible(Module *M, SourceLocation Loc);
@@ -3039,6 +3038,9 @@ class Preprocessor {
   static bool CLK_LexAfterModuleImport(Preprocessor &P, Token &Result) {
     return P.LexAfterModuleImport(Result);
   }
+  static bool CLK_LexAfterModuleDecl(Preprocessor &P, Token &Result) {
+    return P.LexAfterModuleDecl(Result);
+  }
 };
 
 /// Abstract base class that describes a handler that will receive
@@ -3071,6 +3073,77 @@ struct EmbedAnnotationData {
 /// Registry of pragma handlers added by plugins
 using PragmaHandlerRegistry = llvm::Registry<PragmaHandler>;
 
+/// Represents a module or partition name token sequence.
+///
+///     module-name:
+///           module-name-qualifier[opt] identifier
+///
+///     partition-name: [C++20]
+///           : module-name-qualifier[opt] identifier
+///
+///     module-name-qualifier
+///           module-name-qualifier[opt] identifier .
+///
+/// This class can only be created by the preprocessor and guarantees that the
+/// two source arrays are contiguous in memory and contain only 3 kinds of
+/// tokens (identifier, '.' and ':'). It is only available when the
+/// preprocessor returns an annot_module_name token.
+///
+/// For example:
+///
+/// export module m.n:c.d
+///
+/// The module name array has 3 tokens ['m', '.', 'n'].
+/// The partition name array has 4 tokens [':', 'c', '.', 'd'].
+///
+/// When importing a partition in a named module fragment (e.g. import :part1;),
+/// the module name array will be empty, and the partition name array will have
+/// 2 tokens.
+///
+/// When we meet a private-module-fragment (e.g. module :private;), the
+/// preprocessor will not return an annot_module_name token, but will return 2
+/// separate tokens [':', 'kw_private'].
+class ModuleNameInfo {
+  friend class Preprocessor;
+  ArrayRef<Token> ModuleName;
+  ArrayRef<Token> PartitionName;
+
+  ModuleNameInfo(ArrayRef<Token> AnnotToks, std::optional<unsigned> ColonIndex);
+
+public:
+  /// Return the contiguous token array.
+  ArrayRef<Token> getTokens() const {
+    if (ModuleName.empty())
+      return PartitionName;
+    if (PartitionName.empty())
+      return ModuleName;
+    return ArrayRef(ModuleName.begin(), PartitionName.end());
+  }
+  bool hasModuleName() const { return !ModuleName.empty(); }
+  bool hasPartitionName() const { return !PartitionName.empty(); }
+  ArrayRef<Token> getModuleName() const { return ModuleName; }
+  ArrayRef<Token> getPartitionName() const { return PartitionName; }
+  Token getColonToken() const {
+    assert(hasPartitionName() && "Do not have a partition name");
+    return getPartitionName().front();
+  }
+
+  /// Under the standard C++ Modules, the dot is just part of the module name,
+  /// and not a real hierarchy separator. Flatten such module names now.
+  std::string getFlatName() const;
+
+  /// Build a module id path from the contiguous token array, including both
+  /// the module name and the partition name.
+  void getModuleIdPath(
+      SmallVectorImpl<std::pair<IdentifierInfo *, SourceLocation>> &Path) const;
+
+  /// Build a module id path from \p ModuleName.
+  static void getModuleIdPath(
+      ArrayRef<Token> ModuleName,
+      SmallVectorImpl<std::pair<IdentifierInfo *, SourceLocation>> &Path);
+};
+
 } // namespace clang
 
 #endif // LLVM_CLANG_LEX_PREPROCESSOR_H
diff --git a/clang/include/clang/Lex/PreprocessorOptions.h b/clang/include/clang/Lex/PreprocessorOptions.h
index 3f7dd9db18ba7..c2e3d68333024 100644
--- a/clang/include/clang/Lex/PreprocessorOptions.h
+++ b/clang/include/clang/Lex/PreprocessorOptions.h
@@ -211,10 +211,6 @@ class PreprocessorOptions {
   /// If set, the UNIX timestamp specified by SOURCE_DATE_EPOCH.
   std::optional<uint64_t> SourceDateEpoch;
 
-  /// If set, the preprocessor reports an error when processing #pragma mc_func
-  /// on AIX.
-  bool ErrorOnPragmaMcfuncOnAIX = false;
-
 public:
   PreprocessorOptions() : PrecompiledPreambleBytes(0, false) {}
 
@@ -252,7 +248,6 @@ class PreprocessorOptions {
     PrecompiledPreambleBytes.first = 0;
     PrecompiledPreambleBytes.second = false;
     RetainExcludedConditionalBlocks = false;
-    ErrorOnPragmaMcfuncOnAIX = false;
   }
 };
 
diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h
index 4f29fb7d11415..2be3ad39529f0 100644
--- a/clang/include/clang/Lex/Token.h
+++ b/clang/include/clang/Lex/Token.h
@@ -235,6 +235,9 @@ class Token {
     assert(isAnnotation() && "Used AnnotVal on non-annotation token");
     return PtrData;
   }
+  template <class T> T getAnnotationValueAs() const {
+    return static_cast<T>(getAnnotationValue());
+  }
   void setAnnotationValue(void *val) {
     assert(isAnnotation() && "Used AnnotVal on non-annotation token");
     PtrData = val;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 613bab9120dfc..afcdacf02583a 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -221,7 +221,6 @@ class Parser : public CodeCompletionHandler {
   std::unique_ptr<PragmaHandler> MaxTokensHerePragmaHandler;
   std::unique_ptr<PragmaHandler> MaxTokensTotalPragmaHandler;
   std::unique_ptr<PragmaHandler> RISCVPragmaHandler;
-  std::unique_ptr<PragmaHandler> MCFuncPragmaHandler;
 
   std::unique_ptr<CommentHandler> CommentSemaHandler;
 
@@ -3877,7 +3876,7 @@ class Parser : public CodeCompletionHandler {
   }
 
   bool ParseModuleName(
-      SourceLocation UseLoc,
+      SourceLocation UseLoc, ArrayRef<Token> ModuleName,
       SmallVectorImpl<std::pair<IdentifierInfo *, SourceLocation>> &Path,
       bool IsImport);
 
diff --git a/clang/lib/APINotes/APINotesFormat.h b/clang/lib/APINotes/APINotesFormat.h
index 9d254dcc1c9ef..cd6456dbe37b2 100644
--- a/clang/lib/APINotes/APINotesFormat.h
+++ b/clang/lib/APINotes/APINotesFormat.h
@@ -24,7 +24,7 @@ const uint16_t VERSION_MAJOR = 0;
 /// API notes file minor version number.
 ///
 /// When the format changes IN ANY WAY, this number should be incremented.
-const uint16_t VERSION_MINOR = 28; // nested tags
+const uint16_t VERSION_MINOR = 27; // SingleDeclTableKey
 
 const uint8_t kSwiftCopyable = 1;
 const uint8_t kSwiftNonCopyable = 2;
diff --git a/clang/lib/APINotes/APINotesYAMLCompiler.cpp b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
index 11cccc94a15f0..060e1fdaf2fd9 100644
--- a/clang/lib/APINotes/APINotesYAMLCompiler.cpp
+++ b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
@@ -406,9 +406,6 @@ template <> struct ScalarEnumerationTraits<EnumConvenienceAliasKind> {
 } // namespace llvm
 
 namespace {
-struct Tag;
-typedef std::vector<Tag> TagsSeq;
-
 struct Tag {
   StringRef Name;
   AvailabilityItem Availability;
@@ -424,11 +421,9 @@ struct Tag {
   std::optional<EnumConvenienceAliasKind> EnumConvenienceKind;
   std::optional<bool> SwiftCopyable;
   FunctionsSeq Methods;
-
-  /// Tags that are declared within the current tag. Only the tags that have
-  /// corresponding API Notes will be listed.
-  TagsSeq Tags;
 };
+
+typedef std::vector<Tag> TagsSeq;
 } // namespace
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(Tag)
@@ -461,7 +456,6 @@ template <> struct MappingTraits<Tag> {
     IO.mapOptional("EnumKind", T.EnumConvenienceKind);
     IO.mapOptional("SwiftCopyable", T.SwiftCopyable);
     IO.mapOptional("Methods", T.Methods);
-    IO.mapOptional("Tags", T.Tags);
   }
 };
 } // namespace yaml
@@ -964,17 +958,12 @@ class YAMLConverter {
     ContextInfo CI;
     auto TagCtxID = Writer.addContext(ParentContextID, T.Name, ContextKind::Tag,
                                       CI, SwiftVersion);
-    Context TagCtx(TagCtxID, ContextKind::Tag);
 
     for (const auto &CXXMethod : T.Methods) {
       CXXMethodInfo MI;
       convertFunction(CXXMethod, MI);
       Writer.addCXXMethod(TagCtxID, CXXMethod.Name, MI, SwiftVersion);
     }
-
-    // Convert nested tags.
-    for (const auto &Tag : T.Tags)
-      convertTagContext(TagCtx, Tag, SwiftVersion);
   }
 
   void convertTopLevelItems(std::optional<Context> Ctx,
diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp
index d8efbe44dbecb..95e7ac1a3d775 100644
--- a/clang/lib/AST/ASTConcept.cpp
+++ b/clang/lib/AST/ASTConcept.cpp
@@ -28,9 +28,11 @@ CreateUnsatisfiedConstraintRecord(const ASTContext &C,
   else {
     auto &SubstitutionDiagnostic =
         *Detail.get<std::pair<SourceLocation, StringRef> *>();
-    StringRef Message = C.backupStr(SubstitutionDiagnostic.second);
+    unsigned MessageSize = SubstitutionDiagnostic.second.size();
+    char *Mem = new (C) char[MessageSize];
+    memcpy(Mem, SubstitutionDiagnostic.second.data(), MessageSize);
     auto *NewSubstDiag = new (C) std::pair<SourceLocation, StringRef>(
-        SubstitutionDiagnostic.first, Message);
+        SubstitutionDiagnostic.first, StringRef(Mem, MessageSize));
     new (TrailingObject) UnsatisfiedConstraintRecord(NewSubstDiag);
   }
 }
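
The restored lines above spell the copy out by hand; as a standalone sketch of the idiom (assuming an ASTContext &C and a transient StringRef Msg in scope):

    // Copy a transient string into ASTContext-owned memory so the
    // diagnostic text outlives its original buffer.
    char *Mem = new (C) char[Msg.size()];
    memcpy(Mem, Msg.data(), Msg.size());
    StringRef Owned(Mem, Msg.size());
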
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 7af9ea7105bb0..90bcbea072e39 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -3407,7 +3407,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
   }
 }
 
-uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) {
+uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) const {
   assert(!T->isDependentType() &&
          "cannot compute type discriminator of a dependent type");
 
@@ -3417,13 +3417,11 @@ uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) {
   if (T->isFunctionPointerType() || T->isFunctionReferenceType())
     T = T->getPointeeType();
 
-  if (T->isFunctionType()) {
+  if (T->isFunctionType())
     encodeTypeForFunctionPointerAuth(*this, Out, T);
-  } else {
-    T = T.getUnqualifiedType();
-    std::unique_ptr<MangleContext> MC(createMangleContext());
-    MC->mangleCanonicalTypeName(T, Out);
-  }
+  else
+    llvm_unreachable(
+        "type discrimination of non-function type not implemented yet");
 
   return llvm::getPointerAuthStableSipHash(Str);
 }
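
After this change the discriminator is defined only for function types (and pointers or references to them); conceptually the computation reduces to the following sketch, reusing the names from the hunk above:

    // Encode the function type to a stable string, then take a stable
    // 16-bit hash of it; non-function types are now unreachable here.
    SmallString<256> Str;
    llvm::raw_svector_ostream Out(Str);
    encodeTypeForFunctionPointerAuth(Ctx, Out, T);
    uint16_t Discriminator = llvm::getPointerAuthStableSipHash(Str);
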
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 08ef09d353afc..0c27f6f5df2da 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -2949,7 +2949,7 @@ ExpectedDecl ASTNodeImporter::VisitEnumDecl(EnumDecl *D) {
       if (auto *FoundEnum = dyn_cast<EnumDecl>(FoundDecl)) {
         if (!hasSameVisibilityContextAndLinkage(FoundEnum, D))
           continue;
-        if (IsStructuralMatch(D, FoundEnum, !SearchName.isEmpty())) {
+        if (IsStructuralMatch(D, FoundEnum)) {
           EnumDecl *FoundDef = FoundEnum->getDefinition();
           if (D->isThisDeclarationADefinition() && FoundDef)
             return Importer.MapImported(D, FoundDef);
@@ -2960,12 +2960,7 @@ ExpectedDecl ASTNodeImporter::VisitEnumDecl(EnumDecl *D) {
       }
     }
 
-    // In case of unnamed enums, we try to find an existing similar one, if none
-    // was found, perform the import always.
-    // Structural in-equivalence is not detected in this way here, but it may
-    // be found when the parent decl is imported (if the enum is part of a
-    // class). To make this totally exact a more difficult solution is needed.
-    if (SearchName && !ConflictingDecls.empty()) {
+    if (!ConflictingDecls.empty()) {
       ExpectedName NameOrErr = Importer.HandleNameConflict(
           SearchName, DC, IDNS, ConflictingDecls.data(),
           ConflictingDecls.size());
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index b573c2713a3aa..72d68f39a97a5 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -675,9 +675,6 @@ bool CXXRecordDecl::hasSubobjectAtOffsetZeroOfEmptyBaseType(
       if (!IsFirstField && !FD->isZeroSize(Ctx))
         continue;
 
-      if (FD->isInvalidDecl())
-        continue;
-
       //   -- If X is an array type, [visit the element type]

       QualType T = Ctx.getBaseElementType(FD->getType());
       if (auto *RD = T->getAsCXXRecordDecl())
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index e2c9643151126..8d2a1b5611ccc 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -1944,22 +1944,3 @@ CXXParenListInitExpr *CXXParenListInitExpr::CreateEmpty(ASTContext &C,
                          alignof(CXXParenListInitExpr));
   return new (Mem) CXXParenListInitExpr(Empty, NumExprs);
 }
-
-CXXFoldExpr::CXXFoldExpr(QualType T, UnresolvedLookupExpr *Callee,
-                                SourceLocation LParenLoc, Expr *LHS,
-                                BinaryOperatorKind Opcode,
-                                SourceLocation EllipsisLoc, Expr *RHS,
-                                SourceLocation RParenLoc,
-                                std::optional<unsigned> NumExpansions)
-    : Expr(CXXFoldExprClass, T, VK_PRValue, OK_Ordinary), LParenLoc(LParenLoc),
-      EllipsisLoc(EllipsisLoc), RParenLoc(RParenLoc),
-      NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Opcode(Opcode) {
-  // We rely on the asserted invariant to distinguish left and right folds.
-  assert(((LHS && LHS->containsUnexpandedParameterPack()) !=
-          (RHS && RHS->containsUnexpandedParameterPack())) &&
-         "Exactly one of LHS or RHS should contain an unexpanded pack");
-  SubExprs[SubExpr::Callee] = Callee;
-  SubExprs[SubExpr::LHS] = LHS;
-  SubExprs[SubExpr::RHS] = RHS;
-  setDependence(computeDependence(this));
-}
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index fcb382474ea62..5af712dd7257b 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12949,35 +12949,19 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
           Info.Ctx.getTargetInfo().getMaxAtomicInlineWidth();
       if (Size <= Info.Ctx.toCharUnitsFromBits(InlineWidthBits)) {
         if (BuiltinOp == Builtin::BI__c11_atomic_is_lock_free ||
-            Size == CharUnits::One())
+            Size == CharUnits::One() ||
+            E->getArg(1)->isNullPointerConstant(Info.Ctx,
+                                                Expr::NPC_NeverValueDependent))
+          // OK, we will inline appropriately-aligned operations of this size,
+          // and _Atomic(T) is appropriately-aligned.
           return Success(1, E);
 
-        // If the pointer argument can be evaluated to a compile-time constant
-        // integer (or nullptr), check if that value is appropriately aligned.
-        const Expr *PtrArg = E->getArg(1);
-        Expr::EvalResult ExprResult;
-        APSInt IntResult;
-        if (PtrArg->EvaluateAsRValue(ExprResult, Info.Ctx) &&
-            ExprResult.Val.toIntegralConstant(IntResult, PtrArg->getType(),
-                                              Info.Ctx) &&
-            IntResult.isAligned(Size.getAsAlign()))
+        QualType PointeeType = E->getArg(1)->IgnoreImpCasts()->getType()->
+          castAs<PointerType>()->getPointeeType();
+        if (!PointeeType->isIncompleteType() &&
+            Info.Ctx.getTypeAlignInChars(PointeeType) >= Size) {
+          // OK, we will inline operations on this object.
           return Success(1, E);
-
-        // Otherwise, check the type's alignment against Size.
-        if (auto *ICE = dyn_cast<ImplicitCastExpr>(PtrArg)) {
-          // Drop the potential implicit-cast to 'const volatile void*', getting
-          // the underlying type.
-          if (ICE->getCastKind() == CK_BitCast)
-            PtrArg = ICE->getSubExpr();
-        }
-
-        if (auto PtrTy = PtrArg->getType()->getAs<PointerType>()) {
-          QualType PointeeType = PtrTy->getPointeeType();
-          if (!PointeeType->isIncompleteType() &&
-              Info.Ctx.getTypeAlignInChars(PointeeType) >= Size) {
-            // OK, we will inline operations on this object.
-            return Success(1, E);
-          }
         }
       }
     }
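
Illustrative call forms for the builtin whose constant folding changes above (a sketch; results depend on the target's lock-free widths):

    // Null pointer argument: "assume appropriately aligned"; folds to
    // true for sizes the target always inlines.
    static_assert(__atomic_is_lock_free(sizeof(int), nullptr), "");
    // Non-null pointer: the pointee type's alignment (here 'int') is
    // checked against the requested size.
    int V;
    bool B = __atomic_is_lock_free(sizeof V, &V);
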
diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
index fee4432a8f661..a3d4c7d7392da 100644
--- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp
+++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp
@@ -27,8 +27,7 @@ using namespace clang::interp;
 /// Similar information is available via ASTContext::BuiltinInfo,
 /// but that is not correct for our use cases.
 static bool isUnevaluatedBuiltin(unsigned BuiltinID) {
-  return BuiltinID == Builtin::BI__builtin_classify_type ||
-         BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size;
+  return BuiltinID == Builtin::BI__builtin_classify_type;
 }
 
 Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) {
diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index 0fc93c14131e6..ef579bc5d8972 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -1698,8 +1698,10 @@ bool Compiler<Emitter>::VisitUnaryExprOrTypeTraitExpr(
   if (Kind == UETT_VectorElements) {
     if (const auto *VT = E->getTypeOfArgument()->getAs<VectorType>())
       return this->emitConst(VT->getNumElements(), E);
+
+    // FIXME: Apparently we need to catch the fact that a sizeless vector type
+    // has been passed and diagnose that (at run time).
     assert(E->getTypeOfArgument()->isSizelessVectorType());
-    return this->emitSizelessVectorElementSize(E);
   }
 
   if (Kind == UETT_VecStep) {
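
For context, the trait compiled above is exposed through __builtin_vectorelements: it folds to a constant for fixed-width vectors, while the sizeless (scalable) case is what the FIXME defers (sketch):

    typedef int v4i32 __attribute__((vector_size(16)));
    // Four 32-bit lanes in a 16-byte vector.
    static_assert(__builtin_vectorelements(v4i32) == 4, "");
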
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index 4f7e9eac76a32..f7d1201f625bb 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -162,8 +162,7 @@ static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
 }
 
 static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
-                     bool IsActive, const Descriptor *D, unsigned FieldOffset,
-                     bool IsVirtualBase) {
+                     bool IsActive, const Descriptor *D, unsigned FieldOffset) {
   assert(D);
   assert(D->ElemRecord);
 
@@ -173,14 +172,13 @@ static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
   Desc->Desc = D;
   Desc->IsInitialized = D->IsArray;
   Desc->IsBase = true;
-  Desc->IsVirtualBase = IsVirtualBase;
   Desc->IsActive = IsActive && !IsUnion;
   Desc->IsConst = IsConst || D->IsConst;
   Desc->IsFieldMutable = IsMutable || D->IsMutable;
 
   for (const auto &V : D->ElemRecord->bases())
     initBase(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, V.Desc,
-             V.Offset, false);
+             V.Offset);
   for (const auto &F : D->ElemRecord->fields())
     initField(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, IsUnion,
               F.Desc, F.Offset);
@@ -189,11 +187,11 @@ static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
 static void ctorRecord(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
                        bool IsActive, const Descriptor *D) {
   for (const auto &V : D->ElemRecord->bases())
-    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, false);
+    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset);
   for (const auto &F : D->ElemRecord->fields())
     initField(B, Ptr, IsConst, IsMutable, IsActive, D->ElemRecord->isUnion(), F.Desc, F.Offset);
   for (const auto &V : D->ElemRecord->virtual_bases())
-    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, true);
+    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset);
 }
 
 static void destroyField(Block *B, std::byte *Ptr, const Descriptor *D,
diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index 0cc5d77c407e3..0dd97812e5a5c 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -83,8 +83,6 @@ struct InlineDescriptor {
   /// Flag indicating if the field is an embedded base class.
   LLVM_PREFERRED_TYPE(bool)
   unsigned IsBase : 1;
-  LLVM_PREFERRED_TYPE(bool)
-  unsigned IsVirtualBase : 1;
   /// Flag indicating if the field is the active member of a union.
   LLVM_PREFERRED_TYPE(bool)
   unsigned IsActive : 1;
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 8e96f78d90568..b2581b5f7b5d0 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -187,15 +187,10 @@ template <typename T>
 bool CheckDivRem(InterpState &S, CodePtr OpPC, const T &LHS, const T &RHS) {
   if (RHS.isZero()) {
     const auto *Op = cast<BinaryOperator>(S.Current->getExpr(OpPC));
-    if constexpr (std::is_same_v<T, Floating>) {
-      S.CCEDiag(Op, diag::note_expr_divide_by_zero)
-          << Op->getRHS()->getSourceRange();
-      return true;
-    }
-
     S.FFDiag(Op, diag::note_expr_divide_by_zero)
         << Op->getRHS()->getSourceRange();
-    return false;
+    if constexpr (!std::is_same_v<T, Floating>)
+      return false;
   }
 
   if (LHS.isSigned() && LHS.isMin() && RHS.isNegative() && RHS.isMinusOne()) {
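
Constant-evaluation examples affected by the CheckDivRem logic above (illustrative; INT_MIN from <climits>):

    constexpr int A = 1 / 0;         // diagnosed: division by zero
    constexpr float F = 1.0f / 0.0f; // also diagnosed, but the floating
                                     // case lets evaluation continue
    constexpr int B = INT_MIN / -1;  // overflow, caught by the
                                     // isMin()/isMinusOne() check above
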
@@ -2741,15 +2736,6 @@ inline bool InvalidDeclRef(InterpState &S, CodePtr OpPC,
   return CheckDeclRef(S, OpPC, DR);
 }
 
-inline bool SizelessVectorElementSize(InterpState &S, CodePtr OpPC) {
-  if (S.inConstantContext()) {
-    const SourceRange &ArgRange = S.Current->getRange(OpPC);
-    const Expr *E = S.Current->getExpr(OpPC);
-    S.CCEDiag(E, diag::note_constexpr_non_const_vectorelements) << ArgRange;
-  }
-  return false;
-}
-
 inline bool Assume(InterpState &S, CodePtr OpPC) {
   const auto Val = S.Stk.pop<Boolean>();
 
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index c170042144acc..98928b3c22d7c 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -942,29 +942,15 @@ static bool interp__builtin_atomic_lock_free(InterpState &S, CodePtr OpPC,
       if (Ptr.isZero())
         return returnBool(true);
 
-      if (Ptr.isIntegralPointer()) {
-        uint64_t IntVal = Ptr.getIntegerRepresentation();
-        if (APSInt(APInt(64, IntVal, false), true).isAligned(Size.getAsAlign()))
-          return returnBool(true);
-      }
-
-      const Expr *PtrArg = Call->getArg(1);
-      // Otherwise, check the type's alignment against Size.
-      if (const auto *ICE = dyn_cast<ImplicitCastExpr>(PtrArg)) {
-        // Drop the potential implicit-cast to 'const volatile void*', getting
-        // the underlying type.
-        if (ICE->getCastKind() == CK_BitCast)
-          PtrArg = ICE->getSubExpr();
-      }
-
-      if (auto PtrTy = PtrArg->getType()->getAs<PointerType>()) {
-        QualType PointeeType = PtrTy->getPointeeType();
-        if (!PointeeType->isIncompleteType() &&
-            S.getCtx().getTypeAlignInChars(PointeeType) >= Size) {
-          // OK, we will inline operations on this object.
-          return returnBool(true);
-        }
-      }
+      QualType PointeeType = Call->getArg(1)
+                                 ->IgnoreImpCasts()
+                                 ->getType()
+                                 ->castAs<PointerType>()
+                                 ->getPointeeType();
+      // OK, we will inline operations on this object.
+      if (!PointeeType->isIncompleteType() &&
+          S.getCtx().getTypeAlignInChars(PointeeType) >= Size)
+        return returnBool(true);
     }
   }
 
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index 9f29fa9272711..49ebb156ab2fb 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -733,8 +733,6 @@ def InvalidDeclRef : Opcode {
   let Args = [ArgDeclRef];
 }
 
-def SizelessVectorElementSize : Opcode;
-
 def Assume : Opcode;
 
 def ArrayDecay : Opcode;
diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp
index 29579f5db40b6..229007c6d720a 100644
--- a/clang/lib/AST/Interp/Pointer.cpp
+++ b/clang/lib/AST/Interp/Pointer.cpp
@@ -149,10 +149,6 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const {
   CharUnits Offset = CharUnits::Zero();
 
   auto getFieldOffset = [&](const FieldDecl *FD) -> CharUnits {
-    // This shouldn't happen, but if it does, don't crash inside
-    // getASTRecordLayout.
-    if (FD->getParent()->isInvalidDecl())
-      return CharUnits::Zero();
     const ASTRecordLayout &Layout = ASTCtx.getASTRecordLayout(FD->getParent());
     unsigned FieldIndex = FD->getFieldIndex();
     return ASTCtx.toCharUnitsFromBits(Layout.getFieldOffset(FieldIndex));
@@ -180,30 +176,18 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const {
       Path.push_back(APValue::LValuePathEntry::ArrayIndex(Index));
       Ptr = Ptr.getArray();
     } else {
+      // TODO: figure out if base is virtual
       bool IsVirtual = false;
 
       // Create a path entry for the field.
       const Descriptor *Desc = Ptr.getFieldDesc();
       if (const auto *BaseOrMember = Desc->asDecl()) {
-        if (const auto *FD = dyn_cast<FieldDecl>(BaseOrMember)) {
-          Ptr = Ptr.getBase();
+        Path.push_back(APValue::LValuePathEntry({BaseOrMember, IsVirtual}));
+        Ptr = Ptr.getBase();
+
+        if (const auto *FD = dyn_cast<FieldDecl>(BaseOrMember))
           Offset += getFieldOffset(FD);
-        } else if (const auto *RD = dyn_cast<CXXRecordDecl>(BaseOrMember)) {
-          IsVirtual = Ptr.isVirtualBaseClass();
-          Ptr = Ptr.getBase();
-          const Record *BaseRecord = Ptr.getRecord();
-
-          const ASTRecordLayout &Layout = ASTCtx.getASTRecordLayout(
-              cast<CXXRecordDecl>(BaseRecord->getDecl()));
-          if (IsVirtual)
-            Offset += Layout.getVBaseClassOffset(RD);
-          else
-            Offset += Layout.getBaseClassOffset(RD);
 
-        } else {
-          Ptr = Ptr.getBase();
-        }
-        Path.push_back(APValue::LValuePathEntry({BaseOrMember, IsVirtual}));
         continue;
       }
       llvm_unreachable("Invalid field type");
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index e351699023ba5..7fa6a3230a4f9 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -487,9 +487,6 @@ class Pointer {
   }
   /// Checks if a structure is a base class.
   bool isBaseClass() const { return isField() && getInlineDesc()->IsBase; }
-  bool isVirtualBaseClass() const {
-    return isField() && getInlineDesc()->IsVirtualBase;
-  }
   /// Checks if the pointer points to a dummy value.
   bool isDummy() const {
     if (!isBlockPointer())
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 8d7fe18488217..f734168e647bd 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -524,21 +524,12 @@ void Environment::initialize() {
           assert(VarDecl != nullptr);
           setStorageLocation(*VarDecl, createObject(*VarDecl, nullptr));
         } else if (Capture.capturesThis()) {
-          if (auto *Ancestor = InitialTargetFunc->getNonClosureAncestor()) {
-            const auto *SurroundingMethodDecl = cast<CXXMethodDecl>(Ancestor);
-            QualType ThisPointeeType =
-                SurroundingMethodDecl->getFunctionObjectParameterType();
-            setThisPointeeStorageLocation(
-                cast<RecordStorageLocation>(createObject(ThisPointeeType)));
-          } else if (auto *FieldBeingInitialized =
-                         dyn_cast<FieldDecl>(Parent->getLambdaContextDecl())) {
-            // This is in a field initializer, rather than a method.
-            setThisPointeeStorageLocation(
-                cast<RecordStorageLocation>(createObject(QualType(
-                    FieldBeingInitialized->getParent()->getTypeForDecl(), 0))));
-          } else {
-            assert(false && "Unexpected this-capturing lambda context.");
-          }
+          const auto *SurroundingMethodDecl =
+              cast<CXXMethodDecl>(InitialTargetFunc->getNonClosureAncestor());
+          QualType ThisPointeeType =
+              SurroundingMethodDecl->getFunctionObjectParameterType();
+          setThisPointeeStorageLocation(
+              cast<RecordStorageLocation>(createObject(ThisPointeeType)));
         }
       }
     } else if (MethodDecl->isImplicitObjectMemberFunction()) {
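
The capture handling above corresponds to source like the following (illustrative): a lambda capturing 'this' inside a member function, where the enclosing method supplies the this-pointee type.

    struct S {
      int Val;
      void method() {
        // Capture.capturesThis() holds for this lambda; the surrounding
        // CXXMethodDecl provides the type of the captured object.
        auto L = [this] { return Val; };
      }
    };
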
diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp
index 4f7ccaf4021d6..97d830214f890 100644
--- a/clang/lib/Basic/IdentifierTable.cpp
+++ b/clang/lib/Basic/IdentifierTable.cpp
@@ -322,8 +322,9 @@ void IdentifierTable::AddKeywords(const LangOptions &LangOpts) {
   if (LangOpts.IEEE128)
     AddKeyword("__ieee128", tok::kw___float128, KEYALL, LangOpts, *this);
 
-  // Add the 'import' contextual keyword.
+  // Add the 'import' and 'module' contextual keywords.
   get("import").setModulesImport(true);
+  get("module").setModulesDeclaration(true);
 }
 
 /// Checks if the specified token kind represents a keyword in the
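
Like 'import', 'module' remains an ordinary identifier and is only recognized contextually; a consumer would test the flag rather than a dedicated token kind. A sketch, assuming the matching isModulesDeclaration() accessor and an IdentifierInfo *II:

    if (II->isModulesDeclaration()) {
      // Treat 'module' as introducing a module declaration only where
      // the grammar permits; elsewhere it stays a plain identifier.
    }
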
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 6ba31cc05a0d7..6349fcf3dadd7 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -204,8 +204,7 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple,
 StringRef AArch64TargetInfo::getABI() const { return ABI; }
 
 bool AArch64TargetInfo::setABI(const std::string &Name) {
-  if (Name != "aapcs" && Name != "aapcs-soft" && Name != "darwinpcs" &&
-      Name != "pauthtest")
+  if (Name != "aapcs" && Name != "aapcs-soft" && Name != "darwinpcs")
     return false;
 
   ABI = Name;
@@ -219,12 +218,6 @@ bool AArch64TargetInfo::validateTarget(DiagnosticsEngine &Diags) const {
     Diags.Report(diag::err_target_unsupported_abi_with_fpu) << ABI;
     return false;
   }
-  if (getTriple().getEnvironment() == llvm::Triple::PAuthTest &&
-      getTriple().getOS() != llvm::Triple::Linux) {
-    Diags.Report(diag::err_target_unsupported_abi_for_triple)
-        << getTriple().getEnvironmentName() << getTriple().getTriple();
-    return false;
-  }
   return true;
 }
 
diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index cb3fd12c48ddb..75f71a337b7a4 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -200,24 +200,7 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   // Define __loongarch_arch.
   StringRef ArchName = getCPU();
-  if (ArchName == "loongarch64") {
-    if (HasFeatureLSX) {
-      // TODO: As more features of the V1.1 ISA are supported, a unified "v1.1"
-      // arch feature set will be used to include all sub-features belonging to
-      // the V1.1 ISA version.
-      if (HasFeatureFrecipe)
-        Builder.defineMacro("__loongarch_arch",
-                            Twine('"') + "la64v1.1" + Twine('"'));
-      else
-        Builder.defineMacro("__loongarch_arch",
-                            Twine('"') + "la64v1.0" + Twine('"'));
-    } else {
-      Builder.defineMacro("__loongarch_arch",
-                          Twine('"') + ArchName + Twine('"'));
-    }
-  } else {
-    Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"'));
-  }
+  Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"'));
 
   // Define __loongarch_tune.
   StringRef TuneCPU = getTargetOpts().TuneCPU;
@@ -233,8 +216,6 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__loongarch_simd_width", "128");
     Builder.defineMacro("__loongarch_sx", Twine(1));
   }
-  if (HasFeatureFrecipe)
-    Builder.defineMacro("__loongarch_frecipe", Twine(1));
 
   StringRef ABI = getABI();
   if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s")
@@ -310,8 +291,6 @@ bool LoongArchTargetInfo::handleTargetFeatures(
       HasFeatureLASX = true;
     else if (Feature == "-ual")
       HasUnalignedAccess = false;
-    else if (Feature == "+frecipe")
-      HasFeatureFrecipe = true;
   }
   return true;
 }
diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index c668ca7eca047..5fc223483951e 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -29,7 +29,6 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo {
   bool HasFeatureF;
   bool HasFeatureLSX;
   bool HasFeatureLASX;
-  bool HasFeatureFrecipe;
 
 public:
   LoongArchTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
@@ -38,7 +37,6 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo {
     HasFeatureF = false;
     HasFeatureLSX = false;
     HasFeatureLASX = false;
-    HasFeatureFrecipe = false;
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index ba34ab2c7f336..cdec41afd1a4b 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -513,6 +513,15 @@ class LLVM_LIBRARY_VISIBILITY NetBSDI386TargetInfo
 public:
   NetBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
       : NetBSDTargetInfo<X86_32TargetInfo>(Triple, Opts) {}
+
+  LangOptions::FPEvalMethodKind getFPEvalMethod() const override {
+    VersionTuple OsVersion = getTriple().getOSVersion();
+    // New NetBSD uses the default FP evaluation method.
+    if (OsVersion >= VersionTuple(6, 99, 26) || OsVersion.getMajor() == 0)
+      return X86_32TargetInfo::getFPEvalMethod();
+    // NetBSD before 6.99.26 evaluates float expressions in double precision.
+    return LangOptions::FPEvalMethodKind::FEM_Double;
+  }
 };
 
 class LLVM_LIBRARY_VISIBILITY OpenBSDI386TargetInfo
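
In practice FEM_Double corresponds to FLT_EVAL_METHOD == 1: float arithmetic is carried out in double precision. A small illustration of the observable effect on pre-6.99.26 NetBSD/i386 (sketch):

    float mul(float a, float b) {
      // The product is computed in double precision and rounded to
      // float only at the return.
      return a * b;
    }
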
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 234a9c16e39df..d582aba679ddc 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5034,8 +5034,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                                  ReturnValueSlot ReturnValue,
                                  const CallArgList &CallArgs,
                                  llvm::CallBase **callOrInvoke, bool IsMustTail,
-                                 SourceLocation Loc,
-                                 bool IsVirtualFunctionPointerThunk) {
+                                 SourceLocation Loc) {
   // FIXME: We no longer need the types from CallArgs; lift up and simplify.
 
   assert(Callee.isOrdinary() || Callee.isVirtual());
@@ -5099,11 +5098,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   RawAddress SRetAlloca = RawAddress::invalid();
   llvm::Value *UnusedReturnSizePtr = nullptr;
   if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) {
-    if (IsVirtualFunctionPointerThunk && RetAI.isIndirect()) {
-      SRetPtr = makeNaturalAddressForPointer(CurFn->arg_begin() +
-                                                 IRFunctionArgs.getSRetArgNo(),
-                                             RetTy, CharUnits::fromQuantity(1));
-    } else if (!ReturnValue.isNull()) {
+    if (!ReturnValue.isNull()) {
       SRetPtr = ReturnValue.getAddress();
     } else {
       SRetPtr = CreateMemTemp(RetTy, "tmp", &SRetAlloca);
@@ -5882,131 +5877,119 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   CallArgs.freeArgumentMemory(*this);
 
   // Extract the return value.
-  RValue Ret;
+  RValue Ret = [&] {
+    switch (RetAI.getKind()) {
+    case ABIArgInfo::CoerceAndExpand: {
+      auto coercionType = RetAI.getCoerceAndExpandType();
 
-  // If the current function is a virtual function pointer thunk, avoid copying
-  // the return value of the musttail call to a temporary.
-  if (IsVirtualFunctionPointerThunk) {
-    Ret = RValue::get(CI);
-  } else {
-    Ret = [&] {
-      switch (RetAI.getKind()) {
-      case ABIArgInfo::CoerceAndExpand: {
-        auto coercionType = RetAI.getCoerceAndExpandType();
-
-        Address addr = SRetPtr.withElementType(coercionType);
-
-        assert(CI->getType() == RetAI.getUnpaddedCoerceAndExpandType());
-        bool requiresExtract = isa<llvm::StructType>(CI->getType());
-
-        unsigned unpaddedIndex = 0;
-        for (unsigned i = 0, e = coercionType->getNumElements(); i != e; ++i) {
-          llvm::Type *eltType = coercionType->getElementType(i);
-          if (ABIArgInfo::isPaddingForCoerceAndExpand(eltType))
-            continue;
-          Address eltAddr = Builder.CreateStructGEP(addr, i);
-          llvm::Value *elt = CI;
-          if (requiresExtract)
-            elt = Builder.CreateExtractValue(elt, unpaddedIndex++);
-          else
-            assert(unpaddedIndex == 0);
-          Builder.CreateStore(elt, eltAddr);
-        }
-        [[fallthrough]];
-      }
+      Address addr = SRetPtr.withElementType(coercionType);
+
+      assert(CI->getType() == RetAI.getUnpaddedCoerceAndExpandType());
+      bool requiresExtract = isa<llvm::StructType>(CI->getType());
 
-      case ABIArgInfo::InAlloca:
-      case ABIArgInfo::Indirect: {
-        RValue ret = convertTempToRValue(SRetPtr, RetTy, SourceLocation());
-        if (UnusedReturnSizePtr)
-          PopCleanupBlock();
-        return ret;
+      unsigned unpaddedIndex = 0;
+      for (unsigned i = 0, e = coercionType->getNumElements(); i != e; ++i) {
+        llvm::Type *eltType = coercionType->getElementType(i);
+        if (ABIArgInfo::isPaddingForCoerceAndExpand(eltType)) continue;
+        Address eltAddr = Builder.CreateStructGEP(addr, i);
+        llvm::Value *elt = CI;
+        if (requiresExtract)
+          elt = Builder.CreateExtractValue(elt, unpaddedIndex++);
+        else
+          assert(unpaddedIndex == 0);
+        Builder.CreateStore(elt, eltAddr);
       }
+      [[fallthrough]];
+    }
 
-      case ABIArgInfo::Ignore:
-        // If we are ignoring an argument that had a result, make sure to
-        // construct the appropriate return value for our caller.
-        return GetUndefRValue(RetTy);
-
-      case ABIArgInfo::Extend:
-      case ABIArgInfo::Direct: {
-        llvm::Type *RetIRTy = ConvertType(RetTy);
-        if (RetAI.getCoerceToType() == RetIRTy &&
-            RetAI.getDirectOffset() == 0) {
-          switch (getEvaluationKind(RetTy)) {
-          case TEK_Complex: {
-            llvm::Value *Real = Builder.CreateExtractValue(CI, 0);
-            llvm::Value *Imag = Builder.CreateExtractValue(CI, 1);
-            return RValue::getComplex(std::make_pair(Real, Imag));
-          }
-          case TEK_Aggregate: {
-            Address DestPtr = ReturnValue.getAddress();
-            bool DestIsVolatile = ReturnValue.isVolatile();
+    case ABIArgInfo::InAlloca:
+    case ABIArgInfo::Indirect: {
+      RValue ret = convertTempToRValue(SRetPtr, RetTy, SourceLocation());
+      if (UnusedReturnSizePtr)
+        PopCleanupBlock();
+      return ret;
+    }
 
-            if (!DestPtr.isValid()) {
-              DestPtr = CreateMemTemp(RetTy, "agg.tmp");
-              DestIsVolatile = false;
-            }
-            EmitAggregateStore(CI, DestPtr, DestIsVolatile);
-            return RValue::getAggregate(DestPtr);
-          }
-          case TEK_Scalar: {
-            // If the argument doesn't match, perform a bitcast to coerce it.
-            // This can happen due to trivial type mismatches.
-            llvm::Value *V = CI;
-            if (V->getType() != RetIRTy)
-              V = Builder.CreateBitCast(V, RetIRTy);
-            return RValue::get(V);
-          }
-          }
-          llvm_unreachable("bad evaluation kind");
+    case ABIArgInfo::Ignore:
+      // If we are ignoring an argument that had a result, make sure to
+      // construct the appropriate return value for our caller.
+      return GetUndefRValue(RetTy);
+
+    case ABIArgInfo::Extend:
+    case ABIArgInfo::Direct: {
+      llvm::Type *RetIRTy = ConvertType(RetTy);
+      if (RetAI.getCoerceToType() == RetIRTy && RetAI.getDirectOffset() == 0) {
+        switch (getEvaluationKind(RetTy)) {
+        case TEK_Complex: {
+          llvm::Value *Real = Builder.CreateExtractValue(CI, 0);
+          llvm::Value *Imag = Builder.CreateExtractValue(CI, 1);
+          return RValue::getComplex(std::make_pair(Real, Imag));
         }
+        case TEK_Aggregate: {
+          Address DestPtr = ReturnValue.getAddress();
+          bool DestIsVolatile = ReturnValue.isVolatile();
 
-        // If coercing a fixed vector from a scalable vector for ABI
-        // compatibility, and the types match, use the llvm.vector.extract
-        // intrinsic to perform the conversion.
-        if (auto *FixedDstTy = dyn_cast<llvm::FixedVectorType>(RetIRTy)) {
-          llvm::Value *V = CI;
-          if (auto *ScalableSrcTy =
-                  dyn_cast<llvm::ScalableVectorType>(V->getType())) {
-            if (FixedDstTy->getElementType() ==
-                ScalableSrcTy->getElementType()) {
-              llvm::Value *Zero = llvm::Constant::getNullValue(CGM.Int64Ty);
-              V = Builder.CreateExtractVector(FixedDstTy, V, Zero,
-                                              "cast.fixed");
-              return RValue::get(V);
-            }
+          if (!DestPtr.isValid()) {
+            DestPtr = CreateMemTemp(RetTy, "agg.tmp");
+            DestIsVolatile = false;
           }
+          EmitAggregateStore(CI, DestPtr, DestIsVolatile);
+          return RValue::getAggregate(DestPtr);
+        }
+        case TEK_Scalar: {
+          // If the argument doesn't match, perform a bitcast to coerce it.  This
+          // can happen due to trivial type mismatches.
+          llvm::Value *V = CI;
+          if (V->getType() != RetIRTy)
+            V = Builder.CreateBitCast(V, RetIRTy);
+          return RValue::get(V);
         }
-
-        Address DestPtr = ReturnValue.getValue();
-        bool DestIsVolatile = ReturnValue.isVolatile();
-
-        if (!DestPtr.isValid()) {
-          DestPtr = CreateMemTemp(RetTy, "coerce");
-          DestIsVolatile = false;
         }
+        llvm_unreachable("bad evaluation kind");
+      }
 
-        // An empty record can overlap other data (if declared with
-        // no_unique_address); omit the store for such types - as there is no
-        // actual data to store.
-        if (!isEmptyRecord(getContext(), RetTy, true)) {
-          // If the value is offset in memory, apply the offset now.
-          Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI);
-          CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this);
+      // If coercing a fixed vector from a scalable vector for ABI
+      // compatibility, and the types match, use the llvm.vector.extract
+      // intrinsic to perform the conversion.
+      if (auto *FixedDstTy = dyn_cast<llvm::FixedVectorType>(RetIRTy)) {
+        llvm::Value *V = CI;
+        if (auto *ScalableSrcTy =
+                dyn_cast<llvm::ScalableVectorType>(V->getType())) {
+          if (FixedDstTy->getElementType() == ScalableSrcTy->getElementType()) {
+            llvm::Value *Zero = llvm::Constant::getNullValue(CGM.Int64Ty);
+            V = Builder.CreateExtractVector(FixedDstTy, V, Zero, "cast.fixed");
+            return RValue::get(V);
+          }
         }
+      }
 
-        return convertTempToRValue(DestPtr, RetTy, SourceLocation());
+      Address DestPtr = ReturnValue.getValue();
+      bool DestIsVolatile = ReturnValue.isVolatile();
+
+      if (!DestPtr.isValid()) {
+        DestPtr = CreateMemTemp(RetTy, "coerce");
+        DestIsVolatile = false;
       }
 
-      case ABIArgInfo::Expand:
-      case ABIArgInfo::IndirectAliased:
-        llvm_unreachable("Invalid ABI kind for return argument");
+      // An empty record can overlap other data (if declared with
+      // no_unique_address); omit the store for such types - as there is no
+      // actual data to store.
+      if (!isEmptyRecord(getContext(), RetTy, true)) {
+        // If the value is offset in memory, apply the offset now.
+        Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI);
+        CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this);
       }
 
-      llvm_unreachable("Unhandled ABIArgInfo::Kind");
-    }();
-  }
+      return convertTempToRValue(DestPtr, RetTy, SourceLocation());
+    }
+
+    case ABIArgInfo::Expand:
+    case ABIArgInfo::IndirectAliased:
+      llvm_unreachable("Invalid ABI kind for return argument");
+    }
+
+    llvm_unreachable("Unhandled ABIArgInfo::Kind");
+  }();
 
   // Emit the assume_aligned check on the return value.
   if (Ret.isScalar() && TargetDecl) {
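
Stylistically, the restored code extracts the return value with an immediately-invoked lambda so each switch case can return an RValue directly instead of assigning through a mutable local. The pattern in isolation (an abbreviated sketch of the code above, not additional logic):

    RValue Ret = [&] {
      switch (RetAI.getKind()) {
      case ABIArgInfo::Ignore:
        return GetUndefRValue(RetTy);
      // ... one return per ABI kind ...
      default:
        llvm_unreachable("Unhandled ABIArgInfo::Kind");
      }
    }();
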
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 5f58a64d8386c..aa53f96358044 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1076,11 +1076,6 @@ static bool getGEPIndicesToField(CodeGenFunction &CGF, const RecordDecl *RD,
   const CGRecordLayout &Layout = CGF.CGM.getTypes().getCGRecordLayout(RD);
   int64_t FieldNo = -1;
   for (const FieldDecl *FD : RD->fields()) {
-    if (!Layout.containsFieldDecl(FD))
-      // This could happen if the field has a struct type that's empty. I don't
-      // know why either.
-      continue;
-
     FieldNo = Layout.getLLVMFieldNo(FD);
     if (FD == Field) {
       Indices.emplace_back(std::make_pair(RD, CGF.Builder.getInt32(FieldNo)));
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index f22321f0e738a..7c65fccb60855 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -814,7 +814,8 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD,
       llvm::Constant *VTableAddressPoint =
           CGM.getCXXABI().getVTableAddressPoint(BaseSubobject(CD, Offset),
                                                 VTableClass);
-      if (auto Authentication = CGM.getVTablePointerAuthentication(CD)) {
+      if (auto Authentication =
+              CGM.getVTablePointerAuthentication(VTableClass)) {
         VTableAddressPoint = Emitter.tryEmitConstantSignedPointer(
             VTableAddressPoint, *Authentication);
         if (!VTableAddressPoint)
diff --git a/clang/lib/CodeGen/CGPointerAuth.cpp b/clang/lib/CodeGen/CGPointerAuth.cpp
index 0c63b9d6bb7e7..72aed9f24d595 100644
--- a/clang/lib/CodeGen/CGPointerAuth.cpp
+++ b/clang/lib/CodeGen/CGPointerAuth.cpp
@@ -365,40 +365,6 @@ llvm::Constant *CodeGenModule::getFunctionPointer(GlobalDecl GD,
   return getFunctionPointer(getRawFunctionPointer(GD, Ty), FuncType);
 }
 
-CGPointerAuthInfo CodeGenModule::getMemberFunctionPointerAuthInfo(QualType FT) {
-  assert(FT->getAs<MemberPointerType>() && "MemberPointerType expected");
-  const auto &Schema = getCodeGenOpts().PointerAuth.CXXMemberFunctionPointers;
-  if (!Schema)
-    return CGPointerAuthInfo();
-
-  assert(!Schema.isAddressDiscriminated() &&
-         "function pointers cannot use address-specific discrimination");
-
-  llvm::ConstantInt *Discriminator =
-      getPointerAuthOtherDiscriminator(Schema, GlobalDecl(), FT);
-  return CGPointerAuthInfo(Schema.getKey(), Schema.getAuthenticationMode(),
-                           /* IsIsaPointer */ false,
-                           /* AuthenticatesNullValues */ false, Discriminator);
-}
-
-llvm::Constant *CodeGenModule::getMemberFunctionPointer(llvm::Constant *Pointer,
-                                                        QualType FT) {
-  if (CGPointerAuthInfo PointerAuth = getMemberFunctionPointerAuthInfo(FT))
-    return getConstantSignedPointer(
-        Pointer, PointerAuth.getKey(), nullptr,
-        cast_or_null<llvm::ConstantInt>(PointerAuth.getDiscriminator()));
-
-  return Pointer;
-}
-
-llvm::Constant *CodeGenModule::getMemberFunctionPointer(const FunctionDecl *FD,
-                                                        llvm::Type *Ty) {
-  QualType FT = FD->getType();
-  FT = getContext().getMemberPointerType(
-      FT, cast<CXXMethodDecl>(FD)->getParent()->getTypeForDecl());
-  return getMemberFunctionPointer(getRawFunctionPointer(FD, Ty), FT);
-}
-
 std::optional<PointerAuthQualifier>
 CodeGenModule::computeVTPointerAuthentication(const CXXRecordDecl *ThisClass) {
   auto DefaultAuthentication = getCodeGenOpts().PointerAuth.CXXVTablePointers;
diff --git a/clang/lib/CodeGen/CGRecordLayout.h b/clang/lib/CodeGen/CGRecordLayout.h
index 44e888c93108f..6c06ad20fbe56 100644
--- a/clang/lib/CodeGen/CGRecordLayout.h
+++ b/clang/lib/CodeGen/CGRecordLayout.h
@@ -193,10 +193,6 @@ class CGRecordLayout {
     return IsZeroInitializableAsBase;
   }
 
-  bool containsFieldDecl(const FieldDecl *FD) const {
-    return FieldInfo.count(FD) != 0;
-  }
-
   /// Return llvm::StructType element number that corresponds to the
   /// field FD.
   unsigned getLLVMFieldNo(const FieldDecl *FD) const {
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index d6078696a7d91..1e98bea8c8ce3 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -882,8 +882,6 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
   const CodeGenOptions &CodeGenOpts = CGM.getCodeGenOpts();
   if (CodeGenOpts.PointerAuth.FunctionPointers)
     Fn->addFnAttr("ptrauth-calls");
-  if (CodeGenOpts.PointerAuth.IndirectGotos)
-    Fn->addFnAttr("ptrauth-indirect-gotos");
 
   // Apply xray attributes to the function (as a string, for now)
   bool AlwaysXRayAttr = false;
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index ba7b565d97559..eebb865e8f42c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4374,8 +4374,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee,
                   ReturnValueSlot ReturnValue, const CallArgList &Args,
                   llvm::CallBase **callOrInvoke, bool IsMustTail,
-                  SourceLocation Loc,
-                  bool IsVirtualFunctionPointerThunk = false);
+                  SourceLocation Loc);
   RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee,
                   ReturnValueSlot ReturnValue, const CallArgList &Args,
                   llvm::CallBase **callOrInvoke = nullptr,
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 63ed5b4dd0c31..71192cb0e8c4a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -149,8 +149,6 @@ createTargetCodeGenInfo(CodeGenModule &CGM) {
       return createWindowsAArch64TargetCodeGenInfo(CGM, AArch64ABIKind::Win64);
     else if (Target.getABI() == "aapcs-soft")
       Kind = AArch64ABIKind::AAPCSSoft;
-    else if (Target.getABI() == "pauthtest")
-      Kind = AArch64ABIKind::PAuthTest;
 
     return createAArch64TargetCodeGenInfo(CGM, Kind);
   }
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 284bba823baeb..657e681730c3a 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -973,16 +973,8 @@ class CodeGenModule : public CodeGenTypeCache {
   llvm::Constant *getFunctionPointer(llvm::Constant *Pointer,
                                      QualType FunctionType);
 
-  llvm::Constant *getMemberFunctionPointer(const FunctionDecl *FD,
-                                           llvm::Type *Ty = nullptr);
-
-  llvm::Constant *getMemberFunctionPointer(llvm::Constant *Pointer,
-                                           QualType FT);
-
   CGPointerAuthInfo getFunctionPointerAuthInfo(QualType T);
 
-  CGPointerAuthInfo getMemberFunctionPointerAuthInfo(QualType FT);
-
   CGPointerAuthInfo getPointerAuthInfoForPointeeType(QualType type);
 
   CGPointerAuthInfo getPointerAuthInfoForType(QualType type);
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index cd76f8406e7b7..37b436a21fbc0 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -388,9 +388,6 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI {
 
   bool NeedsVTTParameter(GlobalDecl GD) override;
 
-  llvm::Constant *
-  getOrCreateVirtualFunctionPointerThunk(const CXXMethodDecl *MD);
-
   /**************************** RTTI Uniqueness ******************************/
 
 protected:
@@ -429,9 +426,6 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI {
                 const CXXRecordDecl *RD) override;
 
  private:
-   llvm::Constant *
-   getSignedVirtualMemberFunctionPointer(const CXXMethodDecl *MD);
-
    bool hasAnyUnusedVirtualInlineFunction(const CXXRecordDecl *RD) const {
      const auto &VtableLayout =
          CGM.getItaniumVTableContext().getVTableLayout(RD);
@@ -841,25 +835,7 @@ CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
   CalleePtr->addIncoming(VirtualFn, FnVirtual);
   CalleePtr->addIncoming(NonVirtualFn, FnNonVirtual);
 
-  CGPointerAuthInfo PointerAuth;
-
-  if (const auto &Schema =
-          CGM.getCodeGenOpts().PointerAuth.CXXMemberFunctionPointers) {
-    llvm::PHINode *DiscriminatorPHI = Builder.CreatePHI(CGF.IntPtrTy, 2);
-    DiscriminatorPHI->addIncoming(llvm::ConstantInt::get(CGF.IntPtrTy, 0),
-                                  FnVirtual);
-    const auto &AuthInfo =
-        CGM.getMemberFunctionPointerAuthInfo(QualType(MPT, 0));
-    assert(Schema.getKey() == AuthInfo.getKey() &&
-           "Keys for virtual and non-virtual member functions must match");
-    auto *NonVirtualDiscriminator = AuthInfo.getDiscriminator();
-    DiscriminatorPHI->addIncoming(NonVirtualDiscriminator, FnNonVirtual);
-    PointerAuth = CGPointerAuthInfo(
-        Schema.getKey(), Schema.getAuthenticationMode(), Schema.isIsaPointer(),
-        Schema.authenticatesNullValues(), DiscriminatorPHI);
-  }
-
-  CGCallee Callee(FPT, CalleePtr, PointerAuth);
+  CGCallee Callee(FPT, CalleePtr);
   return Callee;
 }
 
@@ -877,25 +853,6 @@ llvm::Value *ItaniumCXXABI::EmitMemberDataPointerAddress(
                                    "memptr.offset");
 }
 
-// See if it's possible to return a constant signed pointer.
-static llvm::Constant *pointerAuthResignConstant(
-    llvm::Value *Ptr, const CGPointerAuthInfo &CurAuthInfo,
-    const CGPointerAuthInfo &NewAuthInfo, CodeGenModule &CGM) {
-  const auto *CPA = dyn_cast<llvm::ConstantPtrAuth>(Ptr);
-
-  if (!CPA)
-    return nullptr;
-
-  assert(CPA->getKey()->getZExtValue() == CurAuthInfo.getKey() &&
-         CPA->getAddrDiscriminator()->isZeroValue() &&
-         CPA->getDiscriminator() == CurAuthInfo.getDiscriminator() &&
-         "unexpected key or discriminators");
-
-  return CGM.getConstantSignedPointer(
-      CPA->getPointer(), NewAuthInfo.getKey(), nullptr,
-      cast<llvm::ConstantInt>(NewAuthInfo.getDiscriminator()));
-}
-
 /// Perform a bitcast, derived-to-base, or base-to-derived member pointer
 /// conversion.
 ///
@@ -923,63 +880,21 @@ llvm::Value *
 ItaniumCXXABI::EmitMemberPointerConversion(CodeGenFunction &CGF,
                                            const CastExpr *E,
                                            llvm::Value *src) {
-  // Use constant emission if we can.
-  if (isa<llvm::Constant>(src))
-    return EmitMemberPointerConversion(E, cast<llvm::Constant>(src));
-
   assert(E->getCastKind() == CK_DerivedToBaseMemberPointer ||
          E->getCastKind() == CK_BaseToDerivedMemberPointer ||
          E->getCastKind() == CK_ReinterpretMemberPointer);
 
-  CGBuilderTy &Builder = CGF.Builder;
-  QualType DstType = E->getType();
-
-  if (DstType->isMemberFunctionPointerType()) {
-    if (const auto &NewAuthInfo =
-            CGM.getMemberFunctionPointerAuthInfo(DstType)) {
-      QualType SrcType = E->getSubExpr()->getType();
-      assert(SrcType->isMemberFunctionPointerType());
-      const auto &CurAuthInfo = CGM.getMemberFunctionPointerAuthInfo(SrcType);
-      llvm::Value *MemFnPtr = Builder.CreateExtractValue(src, 0, "memptr.ptr");
-      llvm::Type *OrigTy = MemFnPtr->getType();
-
-      llvm::BasicBlock *StartBB = Builder.GetInsertBlock();
-      llvm::BasicBlock *ResignBB = CGF.createBasicBlock("resign");
-      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("merge");
-
-      // Check whether we have a virtual offset or a pointer to a function.
-      assert(UseARMMethodPtrABI && "ARM ABI expected");
-      llvm::Value *Adj = Builder.CreateExtractValue(src, 1, "memptr.adj");
-      llvm::Constant *Ptrdiff_1 = llvm::ConstantInt::get(CGM.PtrDiffTy, 1);
-      llvm::Value *AndVal = Builder.CreateAnd(Adj, Ptrdiff_1);
-      llvm::Value *IsVirtualOffset =
-          Builder.CreateIsNotNull(AndVal, "is.virtual.offset");
-      Builder.CreateCondBr(IsVirtualOffset, MergeBB, ResignBB);
-
-      CGF.EmitBlock(ResignBB);
-      llvm::Type *PtrTy = llvm::PointerType::getUnqual(CGM.Int8Ty);
-      MemFnPtr = Builder.CreateIntToPtr(MemFnPtr, PtrTy);
-      MemFnPtr =
-          CGF.emitPointerAuthResign(MemFnPtr, SrcType, CurAuthInfo, NewAuthInfo,
-                                    isa<llvm::Constant>(src));
-      MemFnPtr = Builder.CreatePtrToInt(MemFnPtr, OrigTy);
-      llvm::Value *ResignedVal = Builder.CreateInsertValue(src, MemFnPtr, 0);
-      ResignBB = Builder.GetInsertBlock();
-
-      CGF.EmitBlock(MergeBB);
-      llvm::PHINode *NewSrc = Builder.CreatePHI(src->getType(), 2);
-      NewSrc->addIncoming(src, StartBB);
-      NewSrc->addIncoming(ResignedVal, ResignBB);
-      src = NewSrc;
-    }
-  }
-
   // Under Itanium, reinterprets don't require any additional processing.
   if (E->getCastKind() == CK_ReinterpretMemberPointer) return src;
 
+  // Use constant emission if we can.
+  if (isa<llvm::Constant>(src))
+    return EmitMemberPointerConversion(E, cast<llvm::Constant>(src));
+
   llvm::Constant *adj = getMemberPointerAdjustment(E);
   if (!adj) return src;
 
+  CGBuilderTy &Builder = CGF.Builder;
   bool isDerivedToBase = (E->getCastKind() == CK_DerivedToBaseMemberPointer);
 
   const MemberPointerType *destTy =
@@ -1017,34 +932,6 @@ ItaniumCXXABI::EmitMemberPointerConversion(CodeGenFunction &CGF,
   return Builder.CreateInsertValue(src, dstAdj, 1);
 }
 
-static llvm::Constant *
-pointerAuthResignMemberFunctionPointer(llvm::Constant *Src, QualType DestType,
-                                       QualType SrcType, CodeGenModule &CGM) {
-  assert(DestType->isMemberFunctionPointerType() &&
-         SrcType->isMemberFunctionPointerType() &&
-         "member function pointers expected");
-  if (DestType == SrcType)
-    return Src;
-
-  const auto &NewAuthInfo = CGM.getMemberFunctionPointerAuthInfo(DestType);
-  const auto &CurAuthInfo = CGM.getMemberFunctionPointerAuthInfo(SrcType);
-
-  if (!NewAuthInfo && !CurAuthInfo)
-    return Src;
-
-  llvm::Constant *MemFnPtr = Src->getAggregateElement(0u);
-  if (MemFnPtr->getNumOperands() == 0) {
-    // src must be a pair of null pointers.
-    assert(isa<llvm::ConstantInt>(MemFnPtr) && "constant int expected");
-    return Src;
-  }
-
-  llvm::Constant *ConstPtr = pointerAuthResignConstant(
-      cast<llvm::User>(MemFnPtr)->getOperand(0), CurAuthInfo, NewAuthInfo, CGM);
-  ConstPtr = llvm::ConstantExpr::getPtrToInt(ConstPtr, MemFnPtr->getType());
-  return ConstantFoldInsertValueInstruction(Src, ConstPtr, 0);
-}
-
 llvm::Constant *
 ItaniumCXXABI::EmitMemberPointerConversion(const CastExpr *E,
                                            llvm::Constant *src) {
@@ -1052,12 +939,6 @@ ItaniumCXXABI::EmitMemberPointerConversion(const CastExpr *E,
          E->getCastKind() == CK_BaseToDerivedMemberPointer ||
          E->getCastKind() == CK_ReinterpretMemberPointer);
 
-  QualType DstType = E->getType();
-
-  if (DstType->isMemberFunctionPointerType())
-    src = pointerAuthResignMemberFunctionPointer(
-        src, DstType, E->getSubExpr()->getType(), CGM);
-
   // Under Itanium, reinterprets don't require any additional processing.
   if (E->getCastKind() == CK_ReinterpretMemberPointer) return src;
 
@@ -1155,32 +1036,9 @@ llvm::Constant *ItaniumCXXABI::BuildMemberPointer(const CXXMethodDecl *MD,
       //   least significant bit of adj then makes exactly the same
       //   discrimination as the least significant bit of ptr does for
       //   Itanium.
-
-      // We cannot use the Itanium ABI's representation for virtual member
-      // function pointers under pointer authentication because it would
-      // require us to store both the virtual offset and the constant
-      // discriminator in the pointer, which would be immediately vulnerable
-      // to attack.  Instead we introduce a thunk that does the virtual dispatch
-      // and store it as if it were a non-virtual member function.  This means
-      // that virtual function pointers may not compare equal anymore, but
-      // fortunately they aren't required to by the standard, and we do make
-      // a best-effort attempt to re-use the thunk.
-      //
-      // To support interoperation with code in which pointer authentication
-      // is disabled, derefencing a member function pointer must still handle
-      // the virtual case, but it can use a discriminator which should never
-      // be valid.
-      const auto &Schema =
-          CGM.getCodeGenOpts().PointerAuth.CXXMemberFunctionPointers;
-      if (Schema)
-        MemPtr[0] = llvm::ConstantExpr::getPtrToInt(
-            getSignedVirtualMemberFunctionPointer(MD), CGM.PtrDiffTy);
-      else
-        MemPtr[0] = llvm::ConstantInt::get(CGM.PtrDiffTy, VTableOffset);
-      // Don't set the LSB of adj to 1 if pointer authentication for member
-      // function pointers is enabled.
-      MemPtr[1] = llvm::ConstantInt::get(
-          CGM.PtrDiffTy, 2 * ThisAdjustment.getQuantity() + !Schema);
+      MemPtr[0] = llvm::ConstantInt::get(CGM.PtrDiffTy, VTableOffset);
+      MemPtr[1] = llvm::ConstantInt::get(CGM.PtrDiffTy,
+                                         2 * ThisAdjustment.getQuantity() + 1);
     } else {
       // Itanium C++ ABI 2.3:
       //   For a virtual function, [the pointer field] is 1 plus the
@@ -1202,7 +1060,7 @@ llvm::Constant *ItaniumCXXABI::BuildMemberPointer(const CXXMethodDecl *MD,
       // function type is incomplete.
       Ty = CGM.PtrDiffTy;
     }
-    llvm::Constant *addr = CGM.getMemberFunctionPointer(MD, Ty);
+    llvm::Constant *addr = CGM.GetAddrOfFunction(MD, Ty);
 
     MemPtr[0] = llvm::ConstantExpr::getPtrToInt(addr, CGM.PtrDiffTy);
     MemPtr[1] = llvm::ConstantInt::get(CGM.PtrDiffTy,
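
For orientation, the ARM-style member function pointer encoding this hunk restores can be pictured as (illustrative struct, not a type that exists in the tree):

    struct ItaniumMemberFnPtr {
      ptrdiff_t ptr; // function address, or vtable offset if virtual
      ptrdiff_t adj; // 2 * this-adjustment, low bit set when virtual
                     // (ARM ABI variant)
    };
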
@@ -1222,12 +1080,8 @@ llvm::Constant *ItaniumCXXABI::EmitMemberPointer(const APValue &MP,
 
   CharUnits ThisAdjustment = getContext().getMemberPointerPathAdjustment(MP);
 
-  if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(MPD)) {
-    llvm::Constant *Src = BuildMemberPointer(MD, ThisAdjustment);
-    QualType SrcType = getContext().getMemberPointerType(
-        MD->getType(), MD->getParent()->getTypeForDecl());
-    return pointerAuthResignMemberFunctionPointer(Src, MPType, SrcType, CGM);
-  }
+  if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(MPD))
+    return BuildMemberPointer(MD, ThisAdjustment);
 
   CharUnits FieldOffset =
     getContext().toCharUnitsFromBits(getContext().getFieldOffset(MPD));
@@ -3331,78 +3185,6 @@ bool ItaniumCXXABI::NeedsVTTParameter(GlobalDecl GD) {
   return false;
 }
 
-llvm::Constant *
-ItaniumCXXABI::getOrCreateVirtualFunctionPointerThunk(const CXXMethodDecl *MD) {
-  SmallString<256> MethodName;
-  llvm::raw_svector_ostream Out(MethodName);
-  getMangleContext().mangleCXXName(MD, Out);
-  MethodName += "_vfpthunk_";
-  StringRef ThunkName = MethodName.str();
-  llvm::Function *ThunkFn;
-  if ((ThunkFn = cast_or_null<llvm::Function>(
-           CGM.getModule().getNamedValue(ThunkName))))
-    return ThunkFn;
-
-  const CGFunctionInfo &FnInfo = CGM.getTypes().arrangeCXXMethodDeclaration(MD);
-  llvm::FunctionType *ThunkTy = CGM.getTypes().GetFunctionType(FnInfo);
-  llvm::GlobalValue::LinkageTypes Linkage =
-      MD->isExternallyVisible() ? llvm::GlobalValue::LinkOnceODRLinkage
-                                : llvm::GlobalValue::InternalLinkage;
-  ThunkFn =
-      llvm::Function::Create(ThunkTy, Linkage, ThunkName, &CGM.getModule());
-  if (Linkage == llvm::GlobalValue::LinkOnceODRLinkage)
-    ThunkFn->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  assert(ThunkFn->getName() == ThunkName && "name was uniqued!");
-
-  CGM.SetLLVMFunctionAttributes(MD, FnInfo, ThunkFn, /*IsThunk=*/true);
-  CGM.SetLLVMFunctionAttributesForDefinition(MD, ThunkFn);
-
-  // Stack protection sometimes gets inserted after the musttail call.
-  ThunkFn->removeFnAttr(llvm::Attribute::StackProtect);
-  ThunkFn->removeFnAttr(llvm::Attribute::StackProtectStrong);
-  ThunkFn->removeFnAttr(llvm::Attribute::StackProtectReq);
-
-  // Start codegen.
-  CodeGenFunction CGF(CGM);
-  CGF.CurGD = GlobalDecl(MD);
-  CGF.CurFuncIsThunk = true;
-
-  // Build FunctionArgs.
-  FunctionArgList FunctionArgs;
-  CGF.BuildFunctionArgList(CGF.CurGD, FunctionArgs);
-
-  CGF.StartFunction(GlobalDecl(), FnInfo.getReturnType(), ThunkFn, FnInfo,
-                    FunctionArgs, MD->getLocation(), SourceLocation());
-  llvm::Value *ThisVal = loadIncomingCXXThis(CGF);
-  setCXXABIThisValue(CGF, ThisVal);
-
-  CallArgList CallArgs;
-  for (const VarDecl *VD : FunctionArgs)
-    CGF.EmitDelegateCallArg(CallArgs, VD, SourceLocation());
-
-  const FunctionProtoType *FPT = MD->getType()->getAs<FunctionProtoType>();
-  RequiredArgs Required = RequiredArgs::forPrototypePlus(FPT, /*this*/ 1);
-  const CGFunctionInfo &CallInfo =
-      CGM.getTypes().arrangeCXXMethodCall(CallArgs, FPT, Required, 0);
-  CGCallee Callee = CGCallee::forVirtual(nullptr, GlobalDecl(MD),
-                                         getThisAddress(CGF), ThunkTy);
-  llvm::CallBase *CallOrInvoke;
-  CGF.EmitCall(CallInfo, Callee, ReturnValueSlot(), CallArgs, &CallOrInvoke,
-               /*IsMustTail=*/true, SourceLocation(), true);
-  auto *Call = cast<llvm::CallInst>(CallOrInvoke);
-  Call->setTailCallKind(llvm::CallInst::TCK_MustTail);
-  if (Call->getType()->isVoidTy())
-    CGF.Builder.CreateRetVoid();
-  else
-    CGF.Builder.CreateRet(Call);
-
-  // Finish the function to maintain CodeGenFunction invariants.
-  // FIXME: Don't emit unreachable code.
-  CGF.EmitBlock(CGF.createBasicBlock());
-  CGF.FinishFunction();
-  return ThunkFn;
-}
-
 namespace {
 class ItaniumRTTIBuilder {
   CodeGenModule &CGM;  // Per-module state.
@@ -5093,18 +4875,6 @@ ItaniumCXXABI::LoadVTablePtr(CodeGenFunction &CGF, Address This,
   return {CGF.GetVTablePtr(This, CGM.Int8PtrTy, RD), RD};
 }
 
-llvm::Constant *
-ItaniumCXXABI::getSignedVirtualMemberFunctionPointer(const CXXMethodDecl *MD) {
-  const CXXMethodDecl *origMD =
-      cast<CXXMethodDecl>(CGM.getItaniumVTableContext()
-                              .findOriginalMethod(MD->getCanonicalDecl())
-                              .getDecl());
-  llvm::Constant *thunk = getOrCreateVirtualFunctionPointerThunk(origMD);
-  QualType funcType = CGM.getContext().getMemberPointerType(
-      MD->getType(), MD->getParent()->getTypeForDecl());
-  return CGM.getMemberFunctionPointer(thunk, funcType);
-}
-
 void WebAssemblyCXXABI::emitBeginCatch(CodeGenFunction &CGF,
                                        const CXXCatchStmt *C) {
   if (CGF.getTarget().hasFeature("exception-handling"))
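
To make the member-pointer hunks above concrete: with the pointer-authentication
support reverted, virtual member function pointers go back to the plain
ARM-style Itanium encoding. A minimal sketch of the constant this emits
(illustrative only; the struct and field names are hypothetical, not ABI spec
text):

    #include <cstdint>

    // In-memory form of a C++ member function pointer in this variant.
    struct MemFnPtr {
      intptr_t ptr; // vtable offset for virtual functions, else the address
      intptr_t adj; // 2 * this-adjustment; LSB set marks the virtual case
    };

    // For a virtual method at VTableOffset with this-adjustment A, the
    // initializer built above is { VTableOffset, 2 * A + 1 }.
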
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 2f2138582ba1e..0925609cc74aa 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -437,7 +437,6 @@ enum class AArch64ABIKind {
   DarwinPCS,
   Win64,
   AAPCSSoft,
-  PAuthTest,
 };
 
 std::unique_ptr<TargetCodeGenInfo>
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 20a555afb8092..85ae4d2a26fee 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1031,12 +1031,11 @@ std::string ToolChain::ComputeLLVMTriple(const ArgList &Args,
   }
   case llvm::Triple::aarch64: {
     llvm::Triple Triple = getTriple();
-    tools::aarch64::setPAuthABIInTriple(getDriver(), Args, Triple);
     if (!Triple.isOSBinFormatMachO())
-      return Triple.getTriple();
+      return getTripleString();
 
     if (Triple.isArm64e())
-      return Triple.getTriple();
+      return getTripleString();
 
     // FIXME: older versions of ld64 expect the "arm64" component in the actual
     // triple string and query it to determine whether an LTO file can be
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index fb780fb75651d..b04502a57a9f7 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -557,12 +557,6 @@ void AIX::addClangTargetOptions(
   if (!Args.getLastArgNoClaim(options::OPT_fsized_deallocation,
                               options::OPT_fno_sized_deallocation))
     CC1Args.push_back("-fno-sized-deallocation");
-
-  if (Args.hasFlag(options::OPT_ferr_pragma_mc_func_aix,
-                   options::OPT_fno_err_pragma_mc_func_aix, false))
-    CC1Args.push_back("-ferr-pragma-mc-func-aix");
-  else
-    CC1Args.push_back("-fno-err-pragma-mc-func-aix");
 }
 
 void AIX::addProfileRTLibs(const llvm::opt::ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index f083e40df1314..5fbf38cdda12b 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -449,24 +449,3 @@ void aarch64::getAArch64TargetFeatures(const Driver &D,
   if (Args.getLastArg(options::OPT_mno_bti_at_return_twice))
     Features.push_back("+no-bti-at-return-twice");
 }
-
-void aarch64::setPAuthABIInTriple(const Driver &D, const ArgList &Args,
-                                  llvm::Triple &Triple) {
-  Arg *ABIArg = Args.getLastArg(options::OPT_mabi_EQ);
-  bool HasPAuthABI =
-      ABIArg ? (StringRef(ABIArg->getValue()) == "pauthtest") : false;
-
-  switch (Triple.getEnvironment()) {
-  case llvm::Triple::UnknownEnvironment:
-    if (HasPAuthABI)
-      Triple.setEnvironment(llvm::Triple::PAuthTest);
-    break;
-  case llvm::Triple::PAuthTest:
-    break;
-  default:
-    if (HasPAuthABI)
-      D.Diag(diag::err_drv_unsupported_opt_for_target)
-          << ABIArg->getAsString(Args) << Triple.getTriple();
-    break;
-  }
-}
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.h b/clang/lib/Driver/ToolChains/Arch/AArch64.h
index 6d071167bd392..d47c402d4a42d 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.h
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.h
@@ -28,9 +28,6 @@ void getAArch64TargetFeatures(const Driver &D, const llvm::Triple &Triple,
 std::string getAArch64TargetCPU(const llvm::opt::ArgList &Args,
                                 const llvm::Triple &Triple, llvm::opt::Arg *&A);
 
-void setPAuthABIInTriple(const Driver &D, const llvm::opt::ArgList &Args,
-                         llvm::Triple &triple);
-
 } // end namespace aarch64
 } // end namespace target
 } // end namespace driver
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 1e8aac71dc9ba..4a2b9efc9ffad 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -127,11 +127,6 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
                                            const llvm::Triple &Triple,
                                            const ArgList &Args,
                                            std::vector<StringRef> &Features) {
-  // Enable the `lsx` feature on 64-bit LoongArch by default.
-  if (Triple.isLoongArch64() &&
-      (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ)))
-    Features.push_back("+lsx");
-
   std::string ArchName;
   if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
     ArchName = A->getValue();
@@ -150,11 +145,9 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else if (A->getOption().matches(options::OPT_msingle_float)) {
       Features.push_back("+f");
       Features.push_back("-d");
-      Features.push_back("-lsx");
     } else /*Soft-float*/ {
       Features.push_back("-f");
       Features.push_back("-d");
-      Features.push_back("-lsx");
     }
   } else if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ)) {
     StringRef FPU = A->getValue();
@@ -164,11 +157,9 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else if (FPU == "32") {
       Features.push_back("+f");
       Features.push_back("-d");
-      Features.push_back("-lsx");
     } else if (FPU == "0" || FPU == "none") {
       Features.push_back("-f");
       Features.push_back("-d");
-      Features.push_back("-lsx");
     } else {
       D.Diag(diag::err_drv_loongarch_invalid_mfpu_EQ) << FPU;
     }
@@ -183,42 +174,6 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     A->ignoreTargetSpecific();
   if (Arg *A = Args.getLastArgNoClaim(options::OPT_mfpu_EQ))
     A->ignoreTargetSpecific();
-  if (Arg *A = Args.getLastArgNoClaim(options::OPT_msimd_EQ))
-    A->ignoreTargetSpecific();
-
-  // Select lsx/lasx feature determined by -msimd=.
-  // Option -msimd= precedes -m[no-]lsx and -m[no-]lasx.
-  if (const Arg *A = Args.getLastArg(options::OPT_msimd_EQ)) {
-    StringRef MSIMD = A->getValue();
-    if (MSIMD == "lsx") {
-      // Option -msimd=lsx depends on 64-bit FPU.
-      // -m*-float and -mfpu=none/0/32 conflict with -msimd=lsx.
-      if (llvm::find(Features, "-d") != Features.end())
-        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LSX*/ 0;
-      else
-        Features.push_back("+lsx");
-    } else if (MSIMD == "lasx") {
-      // Option -msimd=lasx depends on 64-bit FPU and LSX.
-      // -m*-float, -mfpu=none/0/32 and -mno-lsx conflict with -msimd=lasx.
-      if (llvm::find(Features, "-d") != Features.end())
-        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
-      else if (llvm::find(Features, "-lsx") != Features.end())
-        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
-
-      // The command options do not contain -mno-lasx.
-      if (!Args.getLastArg(options::OPT_mno_lasx)) {
-        Features.push_back("+lsx");
-        Features.push_back("+lasx");
-      }
-    } else if (MSIMD == "none") {
-      if (llvm::find(Features, "+lsx") != Features.end())
-        Features.push_back("-lsx");
-      if (llvm::find(Features, "+lasx") != Features.end())
-        Features.push_back("-lasx");
-    } else {
-      D.Diag(diag::err_drv_loongarch_invalid_msimd_EQ) << MSIMD;
-    }
-  }
 
   // Select lsx feature determined by -m[no-]lsx.
   if (const Arg *A = Args.getLastArg(options::OPT_mlsx, options::OPT_mno_lsx)) {
@@ -242,6 +197,8 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     if (A->getOption().matches(options::OPT_mlasx)) {
       if (llvm::find(Features, "-d") != Features.end())
         D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
+      else if (llvm::find(Features, "-lsx") != Features.end())
+        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
       else { /*-mlasx*/
         Features.push_back("+lsx");
         Features.push_back("+lasx");
@@ -249,6 +206,35 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else /*-mno-lasx*/
       Features.push_back("-lasx");
   }
+
+  // Select lsx/lasx feature determined by -msimd=.
+  // Option -msimd= has lower priority than -m[no-]lsx and -m[no-]lasx.
+  if (const Arg *A = Args.getLastArg(options::OPT_msimd_EQ)) {
+    StringRef MSIMD = A->getValue();
+    if (MSIMD == "lsx") {
+      // Option -msimd=lsx depends on 64-bit FPU.
+      // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LSX*/ 0;
+      // Only enable LSX if no earlier option disabled it ("-lsx" absent).
+      else if (llvm::find(Features, "-lsx") == Features.end())
+        Features.push_back("+lsx");
+    } else if (MSIMD == "lasx") {
+      // Option -msimd=lasx depends on 64-bit FPU and LSX.
+      // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
+      else if (llvm::find(Features, "-lsx") != Features.end())
+        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
+      // Only enable LASX if no earlier option disabled it ("-lasx" absent).
+      else if (llvm::find(Features, "-lasx") == Features.end()) {
+        Features.push_back("+lsx");
+        Features.push_back("+lasx");
+      }
+    } else if (MSIMD != "none") {
+      D.Diag(diag::err_drv_loongarch_invalid_msimd_EQ) << MSIMD;
+    }
+  }
 }
 
 std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
@@ -267,14 +253,8 @@ std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
 std::string loongarch::getLoongArchTargetCPU(const llvm::opt::ArgList &Args,
                                              const llvm::Triple &Triple) {
   std::string CPU;
-  std::string Arch;
   // If we have -march, use that.
-  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
-    Arch = A->getValue();
-    if (Arch == "la64v1.0" || Arch == "la64v1.1")
-      CPU = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64());
-    else
-      CPU = Arch;
-  }
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
+    CPU = A->getValue();
   return postProcessTargetCPUString(CPU, Triple);
 }
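
A note on the LoongArch reordering above: the explicit -m[no-]lsx / -m[no-]lasx
options are now processed before -msimd=, so -msimd= only enables a feature
that was not explicitly negated. A reduced standalone sketch of that guard,
assuming plain STL in place of the llvm:: helpers (diagnostic paths elided):

    #include <algorithm>
    #include <string>
    #include <vector>

    static bool hasFeature(const std::vector<std::string> &Features,
                           const std::string &F) {
      return std::find(Features.begin(), Features.end(), F) != Features.end();
    }

    // -msimd=lsx: add +lsx only if the FPU is 64-bit ("-d" absent) and LSX
    // was not already disabled by an earlier -mno-lsx.
    void applyMSimdLsx(std::vector<std::string> &Features) {
      if (!hasFeature(Features, "-d") && !hasFeature(Features, "-lsx"))
        Features.push_back("+lsx");
    }
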
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 78936fd634f33..71cdaa10416f4 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1486,41 +1486,6 @@ void AddUnalignedAccessWarning(ArgStringList &CmdArgs) {
 }
 }
 
-// Each combination of options here forms a signing schema, and in most cases
-// each signing schema is its own incompatible ABI. The default values of the
-// options represent the default signing schema.
-static void handlePAuthABI(const ArgList &DriverArgs, ArgStringList &CC1Args) {
-  if (!DriverArgs.hasArg(options::OPT_fptrauth_intrinsics,
-                         options::OPT_fno_ptrauth_intrinsics))
-    CC1Args.push_back("-fptrauth-intrinsics");
-
-  if (!DriverArgs.hasArg(options::OPT_fptrauth_calls,
-                         options::OPT_fno_ptrauth_calls))
-    CC1Args.push_back("-fptrauth-calls");
-
-  if (!DriverArgs.hasArg(options::OPT_fptrauth_returns,
-                         options::OPT_fno_ptrauth_returns))
-    CC1Args.push_back("-fptrauth-returns");
-
-  if (!DriverArgs.hasArg(options::OPT_fptrauth_auth_traps,
-                         options::OPT_fno_ptrauth_auth_traps))
-    CC1Args.push_back("-fptrauth-auth-traps");
-
-  if (!DriverArgs.hasArg(
-          options::OPT_fptrauth_vtable_pointer_address_discrimination,
-          options::OPT_fno_ptrauth_vtable_pointer_address_discrimination))
-    CC1Args.push_back("-fptrauth-vtable-pointer-address-discrimination");
-
-  if (!DriverArgs.hasArg(
-          options::OPT_fptrauth_vtable_pointer_type_discrimination,
-          options::OPT_fno_ptrauth_vtable_pointer_type_discrimination))
-    CC1Args.push_back("-fptrauth-vtable-pointer-type-discrimination");
-
-  if (!DriverArgs.hasArg(options::OPT_fptrauth_init_fini,
-                         options::OPT_fno_ptrauth_init_fini))
-    CC1Args.push_back("-fptrauth-init-fini");
-}
-
 static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
                                     ArgStringList &CmdArgs, bool isAArch64) {
   const Arg *A = isAArch64
@@ -1583,30 +1548,16 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
 
   CmdArgs.push_back(
       Args.MakeArgString(Twine("-msign-return-address=") + Scope));
-  if (Scope != "none") {
-    if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
-      D.Diag(diag::err_drv_unsupported_opt_for_target)
-          << A->getAsString(Args) << Triple.getTriple();
+  if (Scope != "none")
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-msign-return-address-key=") + Key));
-  }
-  if (BranchProtectionPAuthLR) {
-    if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
-      D.Diag(diag::err_drv_unsupported_opt_for_target)
-          << A->getAsString(Args) << Triple.getTriple();
+  if (BranchProtectionPAuthLR)
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-mbranch-protection-pauth-lr")));
-  }
   if (IndirectBranches)
     CmdArgs.push_back("-mbranch-target-enforce");
-  // GCS is currently untested with PAuthABI, but enabling this could be allowed
-  // in future after testing with a suitable system.
-  if (GuardedControlStack) {
-    if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
-      D.Diag(diag::err_drv_unsupported_opt_for_target)
-          << A->getAsString(Args) << Triple.getTriple();
+  if (GuardedControlStack)
     CmdArgs.push_back("-mguarded-control-stack");
-  }
 }
 
 void Clang::AddARMTargetArgs(const llvm::Triple &Triple, const ArgList &Args,
@@ -1750,8 +1701,6 @@ void RenderAArch64ABI(const llvm::Triple &Triple, const ArgList &Args,
     ABIName = A->getValue();
   else if (Triple.isOSDarwin())
     ABIName = "darwinpcs";
-  else if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
-    ABIName = "pauthtest";
   else
     ABIName = "aapcs";
 
@@ -1788,9 +1737,6 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
   // Enable/disable return address signing and indirect branch targets.
   CollectARMPACBTIOptions(getToolChain(), Args, CmdArgs, true /*isAArch64*/);
 
-  if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
-    handlePAuthABI(Args, CmdArgs);
-
   // Handle -msve_vector_bits=<bits>
   if (Arg *A = Args.getLastArg(options::OPT_msve_vector_bits_EQ)) {
     StringRef Val = A->getValue();
@@ -1848,9 +1794,6 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
   Args.addOptInFlag(
       CmdArgs, options::OPT_fptrauth_function_pointer_type_discrimination,
       options::OPT_fno_ptrauth_function_pointer_type_discrimination);
-
-  Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_indirect_gotos,
-                    options::OPT_fno_ptrauth_indirect_gotos);
 }
 
 void Clang::AddLoongArchTargetArgs(const ArgList &Args,
@@ -6723,9 +6666,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     StringRef S0 = A->getValue(), S = S0;
     unsigned Size, Offset = 0;
     if (!Triple.isAArch64() && !Triple.isLoongArch() && !Triple.isRISCV() &&
-        !Triple.isX86() &&
-        !(!Triple.isOSAIX() && (Triple.getArch() == llvm::Triple::ppc ||
-                                Triple.getArch() == llvm::Triple::ppc64)))
+        !Triple.isX86())
       D.Diag(diag::err_drv_unsupported_opt_for_target)
           << A->getAsString(Args) << TripleStr;
     else if (S.consumeInteger(10, Size) ||
@@ -6818,7 +6759,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (const char *Name = C.getTimeTraceFile(&JA)) {
     CmdArgs.push_back(Args.MakeArgString("-ftime-trace=" + Twine(Name)));
     Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_granularity_EQ);
-    Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_verbose);
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_ftrapv_handler_EQ)) {
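
For context on the removed handlePAuthABI: the pauthtest ABI was expressed as a
bundle of -fptrauth-* cc1 defaults, each applied only when the user passed
neither the positive nor the negative form. A compressed sketch of that pattern
(hypothetical helper, not the actual driver API):

    #include <string>
    #include <vector>

    // Append a default cc1 flag unless the user already chose a polarity.
    void addDefaultFlag(std::vector<std::string> &CC1Args, bool UserSetPos,
                        bool UserSetNeg, const std::string &Flag) {
      if (!UserSetPos && !UserSetNeg)
        CC1Args.push_back(Flag);
    }

    // e.g. addDefaultFlag(CC1Args, /*UserSetPos=*/false,
    //                     /*UserSetNeg=*/false, "-fptrauth-calls");
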
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 59453c484ae4f..08a4633902654 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -461,6 +461,13 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("--output-file");
   std::string OutputFileName = TC.getInputFilename(Output);
 
+  // If we are invoking `nvlink` internally, we need to output a `.cubin` file.
+  // FIXME: This should hopefully be removed if NVIDIA updates their tooling.
+  if (!C.getInputArgs().getLastArg(options::OPT_c)) {
+    SmallString<256> Filename(Output.getFilename());
+    llvm::sys::path::replace_extension(Filename, "cubin");
+    OutputFileName = Filename.str();
+  }
   if (Output.isFilename() && OutputFileName != Output.getFilename())
     C.addTempFile(Args.MakeArgString(OutputFileName));
 
@@ -605,21 +612,12 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("-arch");
   CmdArgs.push_back(Args.MakeArgString(GPUArch));
 
-  if (Args.hasArg(options::OPT_ptxas_path_EQ))
-    CmdArgs.push_back(Args.MakeArgString(
-        "--pxtas-path=" + Args.getLastArgValue(options::OPT_ptxas_path_EQ)));
-
   // Add paths specified in LIBRARY_PATH environment variable as -L options.
   addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
 
   // Add standard library search paths passed on the command line.
   Args.AddAllArgs(CmdArgs, options::OPT_L);
   getToolChain().AddFilePathLibArgs(Args, CmdArgs);
-  AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
-
-  if (C.getDriver().isUsingLTO())
-    addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
-                  C.getDriver().getLTOMode() == LTOK_Thin);
 
   // Add paths for the default clang library path.
   SmallString<256> DefaultLibPath =
@@ -627,12 +625,51 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
   CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
 
+  for (const auto &II : Inputs) {
+    if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR ||
+        II.getType() == types::TY_LTO_BC || II.getType() == types::TY_LLVM_BC) {
+      C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
+          << getToolChain().getTripleString();
+      continue;
+    }
+
+    // The 'nvlink' application performs RDC-mode linking when given a '.o'
+    // file and device linking when given a '.cubin' file. We always want to
+    // perform device linking, so just rename any '.o' files.
+    // FIXME: This should hopefully be removed if NVIDIA updates their tooling.
+    if (II.isFilename()) {
+      auto InputFile = getToolChain().getInputFilename(II);
+      if (llvm::sys::path::extension(InputFile) != ".cubin") {
+        // If there are no actions above this one, this is direct input and we
+        // can copy it. Otherwise the input is internal, so a `.cubin` file
+        // should exist.
+        if (II.getAction() && II.getAction()->getInputs().size() == 0) {
+          const char *CubinF =
+              Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath(
+                  llvm::sys::path::stem(InputFile), "cubin"));
+          if (llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF)))
+            continue;
+
+          CmdArgs.push_back(CubinF);
+        } else {
+          SmallString<256> Filename(InputFile);
+          llvm::sys::path::replace_extension(Filename, "cubin");
+          CmdArgs.push_back(Args.MakeArgString(Filename));
+        }
+      } else {
+        CmdArgs.push_back(Args.MakeArgString(InputFile));
+      }
+    } else if (!II.isNothing()) {
+      II.getInputArg().renderAsInput(Args, CmdArgs);
+    }
+  }
+
   C.addCommand(std::make_unique<Command>(
       JA, *this,
       ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
                           "--options-file"},
-      Args.MakeArgString(getToolChain().GetProgramPath("clang-nvlink-wrapper")),
-      CmdArgs, Inputs, Output));
+      Args.MakeArgString(getToolChain().GetProgramPath("nvlink")), CmdArgs,
+      Inputs, Output));
 }
 
 void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
@@ -912,7 +949,11 @@ std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
   if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly())
     return ToolChain::getInputFilename(Input);
 
-  return ToolChain::getInputFilename(Input);
+  // Replace extension for object files with cubin because nvlink relies on
+  // these particular file names.
+  SmallString<256> Filename(ToolChain::getInputFilename(Input));
+  llvm::sys::path::replace_extension(Filename, "cubin");
+  return std::string(Filename);
 }
 
 llvm::opt::DerivedArgList *
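
The .cubin handling above pivots on llvm::sys::path::replace_extension, since
nvlink keys RDC-mode versus device linking off the file extension. A small
sketch of the same transformation using the Support APIs seen in the hunks:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/Path.h"
    #include <string>

    // Turn "foo.o" into "foo.cubin", as getInputFilename now does.
    std::string toCubin(llvm::StringRef ObjectFile) {
      llvm::SmallString<256> Filename(ObjectFile);
      llvm::sys::path::replace_extension(Filename, "cubin");
      return std::string(Filename);
    }
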
diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
index 7a6a6fb209012..7464d88cb350b 100644
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -155,7 +155,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
   bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
     return false;
   }
-  bool HasNativeLLVMSupport() const override { return true; }
   bool isPICDefaultForced() const override { return false; }
   bool SupportsProfiling() const override { return false; }
 
@@ -193,8 +192,6 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
     return &HostTC.getTriple();
   }
 
-  bool HasNativeLLVMSupport() const override { return false; }
-
   std::string getInputFilename(const InputInfo &Input) const override;
 
   llvm::opt::DerivedArgList *
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 2265138edbffb..98a878e1d764d 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -86,9 +86,6 @@ std::string Linux::getMultiarchTriple(const Driver &D,
   case llvm::Triple::aarch64:
     if (IsAndroid)
       return "aarch64-linux-android";
-    if (hasEffectiveTriple() &&
-        getEffectiveTriple().getEnvironment() == llvm::Triple::PAuthTest)
-      return "aarch64-linux-pauthtest";
     return "aarch64-linux-gnu";
   case llvm::Triple::aarch64_be:
     return "aarch64_be-linux-gnu";
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.h b/clang/lib/Driver/ToolChains/PS4CPU.h
index a33728bce5186..0be90183c637c 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.h
+++ b/clang/lib/Driver/ToolChains/PS4CPU.h
@@ -179,7 +179,7 @@ class LLVM_LIBRARY_VISIBILITY PS5CPU : public PS4PS5Base {
                         llvm::opt::ArgStringList &CmdArgs, const char *Prefix,
                         const char *Suffix) const override;
   const char *getProfileRTLibName() const override {
-    return "libclang_rt.profile_nosubmission.a";
+    return "libclang_rt.profile-x86_64_nosubmission.a";
   }
 
 protected:
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 21924a8fe17d1..db66911f00f63 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -2467,9 +2467,9 @@ class AnnotatingParser {
         Current.setType(TT_CastRParen);
       if (Current.MatchingParen && Current.Next &&
           !Current.Next->isBinaryOperator() &&
-          !Current.Next->isOneOf(
-              tok::semi, tok::colon, tok::l_brace, tok::l_paren, tok::comma,
-              tok::period, tok::arrow, tok::coloncolon, tok::kw_noexcept)) {
+          !Current.Next->isOneOf(tok::semi, tok::colon, tok::l_brace,
+                                 tok::comma, tok::period, tok::arrow,
+                                 tok::coloncolon, tok::kw_noexcept)) {
         if (FormatToken *AfterParen = Current.MatchingParen->Next;
             AfterParen && AfterParen->isNot(tok::caret)) {
           // Make sure this isn't the return type of an Obj-C block declaration.
@@ -2625,10 +2625,8 @@ class AnnotatingParser {
       return false;
 
     // int a or auto a.
-    if (PreviousNotConst->isOneOf(tok::identifier, tok::kw_auto) &&
-        PreviousNotConst->isNot(TT_StatementAttributeLikeMacro)) {
+    if (PreviousNotConst->isOneOf(tok::identifier, tok::kw_auto))
       return true;
-    }
 
     // *a or &a or &&a.
     if (PreviousNotConst->is(TT_PointerOrReference))
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index f6b6c44a4cab6..bf57addff1c6d 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1488,30 +1488,20 @@ void CompilerInvocation::setDefaultPointerAuthOptions(
         Key::ASDA, LangOpts.PointerAuthVTPtrAddressDiscrimination,
         LangOpts.PointerAuthVTPtrTypeDiscrimination ? Discrimination::Type
                                                     : Discrimination::None);
-
-    if (LangOpts.PointerAuthTypeInfoVTPtrDiscrimination)
-      Opts.CXXTypeInfoVTablePointer =
-          PointerAuthSchema(Key::ASDA, true, Discrimination::Constant,
-                            StdTypeInfoVTablePointerConstantDiscrimination);
-    else
-      Opts.CXXTypeInfoVTablePointer =
-          PointerAuthSchema(Key::ASDA, false, Discrimination::None);
-
+    Opts.CXXTypeInfoVTablePointer =
+        PointerAuthSchema(Key::ASDA, false, Discrimination::None);
     Opts.CXXVTTVTablePointers =
         PointerAuthSchema(Key::ASDA, false, Discrimination::None);
     Opts.CXXVirtualFunctionPointers = Opts.CXXVirtualVariadicFunctionPointers =
         PointerAuthSchema(Key::ASIA, true, Discrimination::Decl);
-    Opts.CXXMemberFunctionPointers =
-        PointerAuthSchema(Key::ASIA, false, Discrimination::Type);
   }
-  Opts.IndirectGotos = LangOpts.PointerAuthIndirectGotos;
 }
 
 static void parsePointerAuthOptions(PointerAuthOptions &Opts,
                                     const LangOptions &LangOpts,
                                     const llvm::Triple &Triple,
                                     DiagnosticsEngine &Diags) {
-  if (!LangOpts.PointerAuthCalls && !LangOpts.PointerAuthIndirectGotos)
+  if (!LangOpts.PointerAuthCalls)
     return;
 
   CompilerInvocation::setDefaultPointerAuthOptions(Opts, LangOpts, Triple);
@@ -3415,17 +3405,12 @@ static void GeneratePointerAuthArgs(const LangOptions &Opts,
     GenerateArg(Consumer, OPT_fptrauth_calls);
   if (Opts.PointerAuthReturns)
     GenerateArg(Consumer, OPT_fptrauth_returns);
-  if (Opts.PointerAuthIndirectGotos)
-    GenerateArg(Consumer, OPT_fptrauth_indirect_gotos);
   if (Opts.PointerAuthAuthTraps)
     GenerateArg(Consumer, OPT_fptrauth_auth_traps);
   if (Opts.PointerAuthVTPtrAddressDiscrimination)
     GenerateArg(Consumer, OPT_fptrauth_vtable_pointer_address_discrimination);
   if (Opts.PointerAuthVTPtrTypeDiscrimination)
     GenerateArg(Consumer, OPT_fptrauth_vtable_pointer_type_discrimination);
-  if (Opts.PointerAuthTypeInfoVTPtrDiscrimination)
-    GenerateArg(Consumer, OPT_fptrauth_type_info_vtable_pointer_discrimination);
-
   if (Opts.PointerAuthInitFini)
     GenerateArg(Consumer, OPT_fptrauth_init_fini);
   if (Opts.PointerAuthFunctionTypeDiscrimination)
@@ -3437,15 +3422,11 @@ static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
   Opts.PointerAuthIntrinsics = Args.hasArg(OPT_fptrauth_intrinsics);
   Opts.PointerAuthCalls = Args.hasArg(OPT_fptrauth_calls);
   Opts.PointerAuthReturns = Args.hasArg(OPT_fptrauth_returns);
-  Opts.PointerAuthIndirectGotos = Args.hasArg(OPT_fptrauth_indirect_gotos);
   Opts.PointerAuthAuthTraps = Args.hasArg(OPT_fptrauth_auth_traps);
   Opts.PointerAuthVTPtrAddressDiscrimination =
       Args.hasArg(OPT_fptrauth_vtable_pointer_address_discrimination);
   Opts.PointerAuthVTPtrTypeDiscrimination =
       Args.hasArg(OPT_fptrauth_vtable_pointer_type_discrimination);
-  Opts.PointerAuthTypeInfoVTPtrDiscrimination =
-      Args.hasArg(OPT_fptrauth_type_info_vtable_pointer_discrimination);
-
   Opts.PointerAuthInitFini = Args.hasArg(OPT_fptrauth_init_fini);
   Opts.PointerAuthFunctionTypeDiscrimination =
       Args.hasArg(OPT_fptrauth_function_pointer_type_discrimination);
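
After this hunk, the remaining pointer-auth defaults all have the simple shape
below; reading one constructor call (parameter meanings inferred from the
surrounding code, so treat this as an annotated guess):

    // PointerAuthSchema(Key::ASDA,             // sign with the ASDA data key
    //                   /*Address=*/false,     // no address discrimination
    //                   Discrimination::None); // no extra discriminator
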
diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
index 0592423c12eca..1fff88ccf0405 100644
--- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -758,9 +758,10 @@ void PrintPPOutputPPCallbacks::HandleWhitespaceBeforeTok(const Token &Tok,
   // These tokens are not expanded to anything and don't need whitespace before
   // them.
   if (Tok.is(tok::eof) ||
-      (Tok.isAnnotation() && !Tok.is(tok::annot_header_unit) &&
-       !Tok.is(tok::annot_module_begin) && !Tok.is(tok::annot_module_end) &&
-       !Tok.is(tok::annot_repl_input_end) && !Tok.is(tok::annot_embed)))
+      (Tok.isAnnotation() && Tok.isNot(tok::annot_header_unit) &&
+       Tok.isNot(tok::annot_module_begin) && Tok.isNot(tok::annot_module_end) &&
+       Tok.isNot(tok::annot_module_name) &&
+       Tok.isNot(tok::annot_repl_input_end) && Tok.isNot(tok::annot_embed)))
     return;
 
   // EmittedDirectiveOnThisLine takes priority over RequireSameLine.
@@ -951,6 +952,11 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
       PP.Lex(Tok);
       IsStartOfLine = true;
       continue;
+    } else if (Tok.is(tok::annot_module_name)) {
+      auto *Info = static_cast<ModuleNameInfo *>(Tok.getAnnotationValue());
+      *Callbacks->OS << Info->getFlatName();
+      PP.Lex(Tok);
+      continue;
     } else if (Tok.is(tok::annot_header_unit)) {
       // This is a header-name that has been (effectively) converted into a
       // module-name.
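
The two PrintPreprocessedOutput hunks make -E reproduce the spelled module name
from the new annotation token. Illustratively (a hedged example, not a test
from this patch):

    // Input TU:
    //   export module a.b:impl;
    //
    // -E output (the annot_module_name token prints via getFlatName()):
    //   export module a.b:impl;
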
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 31a4c0f52b465..8a020d0e95fe3 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -88,8 +88,8 @@ struct Scanner {
   [[nodiscard]] dependency_directives_scan::Token &
   lexToken(const char *&First, const char *const End);
 
-  [[nodiscard]] dependency_directives_scan::Token &
-  lexIncludeFilename(const char *&First, const char *const End);
+  dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
+                                                        const char *const End);
 
   void skipLine(const char *&First, const char *const End);
   void skipDirective(StringRef Name, const char *&First, const char *const End);
@@ -544,7 +544,7 @@ Scanner::lexIncludeFilename(const char *&First, const char *const End) {
 void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
   while (true) {
     const dependency_directives_scan::Token &Tok = lexToken(First, End);
-    if (Tok.is(tok::eod) || Tok.is(tok::eof))
+    if (Tok.is(tok::eod))
       break;
   }
 }
@@ -912,11 +912,7 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) {
   case pp___include_macros:
   case pp_include_next:
   case pp_import:
-    // Ignore missing filenames in include or import directives.
-    if (lexIncludeFilename(First, End).is(tok::eod)) {
-      skipDirective(Id, First, End);
-      return true;
-    }
+    lexIncludeFilename(First, End);
     break;
   default:
     break;
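
For reference, the branch removed from the dependency scanner existed to
tolerate include/import directives with a missing filename, for example:

    #include
    import ;

With this change, such directives flow through lexIncludeFilename
unconditionally again.
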
diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp
index 8221db46e06ac..c3a903917e9ce 100644
--- a/clang/lib/Lex/PPLexerChange.cpp
+++ b/clang/lib/Lex/PPLexerChange.cpp
@@ -122,7 +122,8 @@ void Preprocessor::EnterSourceFileWithLexer(Lexer *TheLexer,
   CurPPLexer = TheLexer;
   CurDirLookup = CurDir;
   CurLexerSubmodule = nullptr;
-  if (CurLexerCallback != CLK_LexAfterModuleImport)
+  if (CurLexerCallback != CLK_LexAfterModuleImport &&
+      CurLexerCallback != CLK_LexAfterModuleDecl)
     CurLexerCallback = TheLexer->isDependencyDirectivesLexer()
                            ? CLK_DependencyDirectivesLexer
                            : CLK_Lexer;
@@ -161,8 +162,7 @@ void Preprocessor::EnterMacro(Token &Tok, SourceLocation ILEnd,
   PushIncludeMacroStack();
   CurDirLookup = nullptr;
   CurTokenLexer = std::move(TokLexer);
-  if (CurLexerCallback != CLK_LexAfterModuleImport)
-    CurLexerCallback = CLK_TokenLexer;
+  CurLexerCallback = CLK_TokenLexer;
 }
 
 /// EnterTokenStream - Add a "macro" context to the top of the include stack,
@@ -216,7 +216,8 @@ void Preprocessor::EnterTokenStream(const Token *Toks, unsigned NumToks,
   PushIncludeMacroStack();
   CurDirLookup = nullptr;
   CurTokenLexer = std::move(TokLexer);
-  if (CurLexerCallback != CLK_LexAfterModuleImport)
+  if (CurLexerCallback != CLK_LexAfterModuleImport &&
+      CurLexerCallback != CLK_LexAfterModuleDecl)
     CurLexerCallback = CLK_TokenLexer;
 }
 
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 63e27e62cffc8..2726fae344337 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -860,9 +860,15 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) {
     ModuleImportLoc = Identifier.getLocation();
     NamedModuleImportPath.clear();
     IsAtImport = true;
-    ModuleImportExpectsIdentifier = true;
     CurLexerCallback = CLK_LexAfterModuleImport;
   }
+
+  if ((II.isModulesDeclaration() || Identifier.is(tok::kw_module)) &&
+      !InMacroArgs && !DisableMacroExpansion &&
+      (getLangOpts().CPlusPlusModules || getLangOpts().DebuggerSupport) &&
+      CurLexerCallback != CLK_CachingLexer) {
+    CurLexerCallback = CLK_LexAfterModuleDecl;
+  }
   return true;
 }
 
@@ -905,6 +911,7 @@ void Preprocessor::Lex(Token &Result) {
     // This token is injected to represent the translation of '#include "a.h"'
     // into "import a.h;". Mimic the notional ';'.
     case tok::annot_module_include:
+    case tok::annot_repl_input_end:
     case tok::semi:
       TrackGMFState.handleSemi();
       StdCXXImportSeqState.handleSemi();
@@ -919,12 +926,30 @@ void Preprocessor::Lex(Token &Result) {
       StdCXXImportSeqState.handleExport();
       ModuleDeclState.handleExport();
       break;
-    case tok::colon:
-      ModuleDeclState.handleColon();
-      break;
-    case tok::period:
-      ModuleDeclState.handlePeriod();
+    case tok::annot_module_name: {
+      auto *Info = static_cast<ModuleNameInfo *>(Result.getAnnotationValue());
+      for (const auto &Tok : Info->getTokens()) {
+        switch (Tok.getKind()) {
+        case tok::identifier:
+          ModuleDeclState.handleIdentifier(Tok.getIdentifierInfo());
+          break;
+        case tok::period:
+          ModuleDeclState.handlePeriod();
+          break;
+        case tok::colon:
+          ModuleDeclState.handleColon();
+          break;
+        default:
+          llvm_unreachable("Unexpected token in module name");
+        }
+      }
+      if (ModuleDeclState.isModuleCandidate())
+        break;
+      TrackGMFState.handleMisc();
+      StdCXXImportSeqState.handleMisc();
+      ModuleDeclState.handleMisc();
       break;
+    }
     case tok::identifier:
       // Check "import" and "module" when there is no open bracket. The two
       // identifiers are not meaningful with open brackets.
@@ -936,17 +961,17 @@ void Preprocessor::Lex(Token &Result) {
             ModuleImportLoc = Result.getLocation();
             NamedModuleImportPath.clear();
             IsAtImport = false;
-            ModuleImportExpectsIdentifier = true;
             CurLexerCallback = CLK_LexAfterModuleImport;
           }
           break;
-        } else if (Result.getIdentifierInfo() == getIdentifierInfo("module")) {
+        }
+        if (Result.getIdentifierInfo()->isModulesDeclaration()) {
           TrackGMFState.handleModule(StdCXXImportSeqState.afterTopLevelSeq());
           ModuleDeclState.handleModule();
+          CurLexerCallback = CLK_LexAfterModuleDecl;
           break;
         }
       }
-      ModuleDeclState.handleIdentifier(Result.getIdentifierInfo());
       if (ModuleDeclState.isModuleCandidate())
         break;
       [[fallthrough]];
@@ -1121,6 +1146,151 @@ void Preprocessor::CollectPpImportSuffix(SmallVectorImpl<Token> &Toks) {
   }
 }
 
+ModuleNameInfo::ModuleNameInfo(ArrayRef<Token> AnnotToks,
+                               std::optional<unsigned> ColonIndex) {
+  assert(!AnnotToks.empty() && "Named module token cannot be empty.");
+  if (!ColonIndex.has_value())
+    ColonIndex = AnnotToks.size();
+  ModuleName = ArrayRef(AnnotToks.begin(), AnnotToks.begin() + *ColonIndex);
+  PartitionName = ArrayRef(AnnotToks.begin() + *ColonIndex, AnnotToks.end());
+  assert(ModuleName.end() == PartitionName.begin());
+}
+
+std::string ModuleNameInfo::getFlatName() const {
+  std::string FlatModuleName;
+  for (auto &Tok : getTokens()) {
+    switch (Tok.getKind()) {
+    case tok::identifier:
+      FlatModuleName += Tok.getIdentifierInfo()->getName();
+      break;
+    case tok::period:
+      FlatModuleName += '.';
+      break;
+    case tok::colon:
+      FlatModuleName += ':';
+      break;
+    default:
+      llvm_unreachable("Unexpected token in module name");
+    }
+  }
+  return FlatModuleName;
+}
+
+void ModuleNameInfo::getModuleIdPath(
+    SmallVectorImpl<std::pair<IdentifierInfo *, SourceLocation>> &Path) const {
+  return getModuleIdPath(getTokens(), Path);
+}
+
+void ModuleNameInfo::getModuleIdPath(
+    ArrayRef<Token> ModuleName,
+    SmallVectorImpl<std::pair<IdentifierInfo *, SourceLocation>> &Path) {
+  for (const auto &Tok : ModuleName) {
+    if (Tok.is(tok::identifier))
+      Path.push_back(
+          std::make_pair(Tok.getIdentifierInfo(), Tok.getLocation()));
+  }
+}
+
+/// Lex a module name or a partition name.
+///
+///     module-name:
+///           module-name-qualifier[opt] identifier
+///
+///     partition-name: [C++20]
+///           : module-name-qualifier[opt] identifier
+///
+///     module-name-qualifier
+///           module-name-qualifier[opt] identifier .
+bool Preprocessor::LexModuleName(Token &Result, bool IsImport) {
+  bool ExpectsIdentifier = true, IsLexingPartition = false;
+  SmallVector<Token, 8> ModuleName;
+  std::optional<unsigned> ColonTokIndex;
+  auto LexNextToken = [&](Token &Tok) {
+    if (IsImport)
+      Lex(Tok);
+    else
+      LexUnexpandedToken(Tok);
+  };
+
+  while (true) {
+    LexNextToken(Result);
+    if (ExpectsIdentifier && Result.is(tok::identifier)) {
+      auto *MI = getMacroInfo(Result.getIdentifierInfo());
+      if (getLangOpts().CPlusPlusModules && !IsImport && MI &&
+          MI->isObjectLike()) {
+        Diag(Result, diag::err_module_decl_cannot_be_macros)
+            << Result.getLocation() << IsLexingPartition
+            << Result.getIdentifierInfo();
+      }
+      ModuleName.push_back(Result);
+      ExpectsIdentifier = false;
+      continue;
+    }
+
+    if (!ExpectsIdentifier && Result.is(tok::period)) {
+      ModuleName.push_back(Result);
+      ExpectsIdentifier = true;
+      continue;
+    }
+
+    // Module partitions are only allowed in C++20 Modules.
+    if (getLangOpts().CPlusPlusModules && Result.is(tok::colon)) {
+      // Handle forms like: import :P;
+      // If the token after ':' is not an identifier, this is an invalid
+      // module name.
+      if (ModuleName.empty()) {
+        Token Tmp;
+        LexNextToken(Tmp);
+        EnterToken(Tmp, /*IsReinject=*/false);
+        // A private-module-fragment:
+        // export module :private;
+        if (!IsImport && Tmp.is(tok::kw_private))
+          return true;
+        // import :N;
+        if (IsImport && Tmp.isNot(tok::identifier))
+          return false;
+      } else if (!ExpectsIdentifier) {
+        ExpectsIdentifier = true;
+      }
+      IsLexingPartition = true;
+      ColonTokIndex = ModuleName.size();
+      ModuleName.push_back(Result);
+      continue;
+    }
+
+    // [cpp.module]/p2: where the pp-tokens (if any) shall not begin with a (
+    // preprocessing token [...]
+    //
+    // We only emit a diagnostic in the preprocessor; the parser skips invalid
+    // tokens and recovers from errors.
+    if (getLangOpts().CPlusPlusModules && !ExpectsIdentifier &&
+        Result.is(tok::l_paren))
+      Diag(Result, diag::err_unxepected_paren_in_module_decl)
+          << IsLexingPartition;
+    break;
+  }
+
+  // Put the last token back into the stream; it's not a valid part of the
+  // module name. We lexed it unexpanded, but it might still be a valid macro
+  // expansion.
+  Result.clearFlag(Token::DisableExpand);
+  auto ToksCopy = std::make_unique<Token[]>(1);
+  *ToksCopy.get() = Result;
+  EnterTokenStream(std::move(ToksCopy), 1,
+                   /*DisableMacroExpansion=*/false,
+                   /*IsReinject=*/false);
+
+  if (ModuleName.empty())
+    return false;
+  Result.startToken();
+  Result.setKind(tok::annot_module_name);
+  Result.setLocation(ModuleName.front().getLocation());
+  Result.setAnnotationEndLoc(ModuleName.back().getLocation());
+  auto AnnotToks = ArrayRef(ModuleName).copy(getPreprocessorAllocator());
+  ModuleNameInfo *Info =
+      new (getPreprocessorAllocator()) ModuleNameInfo(AnnotToks, ColonTokIndex);
+  Result.setAnnotationValue(static_cast<void *>(Info));
+  return true;
+}
 
 /// Lex a token following the 'import' contextual keyword.
 ///
@@ -1145,6 +1315,17 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) {
   // Figure out what kind of lexer we actually have.
   recomputeCurLexerKind();
 
+  // Allocate a holding buffer for a sequence of tokens and introduce it into
+  // the token stream.
+  auto EnterTokens = [this](ArrayRef<Token> Toks) {
+    auto ToksCopy = std::make_unique<Token[]>(Toks.size());
+    std::copy(Toks.begin(), Toks.end(), ToksCopy.get());
+    EnterTokenStream(std::move(ToksCopy), Toks.size(),
+                     /*DisableMacroExpansion*/ true, /*IsReinject*/ false);
+  };
+
+  SmallVector<Token, 32> Suffix;
+
   // Lex the next token. The header-name lexing rules are used at the start of
   // a pp-import.
   //
@@ -1155,122 +1336,108 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) {
     if (LexHeaderName(Result))
       return true;
 
-    if (Result.is(tok::colon) && ModuleDeclState.isNamedModule()) {
-      std::string Name = ModuleDeclState.getPrimaryName().str();
-      Name += ":";
-      NamedModuleImportPath.push_back(
-          {getIdentifierInfo(Name), Result.getLocation()});
-      CurLexerCallback = CLK_LexAfterModuleImport;
-      return true;
-    }
-  } else {
-    Lex(Result);
-  }
+    // Check for a header-name.
+    if (Result.is(tok::header_name)) {
+      // Enter the header-name token into the token stream; a Lex action cannot
+      // both return a token and cache tokens (doing so would corrupt the token
+      // cache if the call to Lex comes from CachingLex / PeekAhead).
+      Suffix.push_back(Result);
+
+      // Consume the pp-import-suffix and expand any macros in it now. We'll add
+      // it back into the token stream later.
+      CollectPpImportSuffix(Suffix);
+      if (Suffix.back().isNot(tok::semi)) {
+        // This is not a pp-import after all.
+        EnterTokens(Suffix);
+        return false;
+      }
 
-  // Allocate a holding buffer for a sequence of tokens and introduce it into
-  // the token stream.
-  auto EnterTokens = [this](ArrayRef<Token> Toks) {
-    auto ToksCopy = std::make_unique<Token[]>(Toks.size());
-    std::copy(Toks.begin(), Toks.end(), ToksCopy.get());
-    EnterTokenStream(std::move(ToksCopy), Toks.size(),
-                     /*DisableMacroExpansion*/ true, /*IsReinject*/ false);
-  };
+      // C++2a [cpp.module]p1:
+      //   The ';' preprocessing-token terminating a pp-import shall not have
+      //   been produced by macro replacement.
+      SourceLocation SemiLoc = Suffix.back().getLocation();
+      if (SemiLoc.isMacroID())
+        Diag(SemiLoc, diag::err_header_import_semi_in_macro);
+
+      // Reconstitute the import token.
+      Token ImportTok;
+      ImportTok.startToken();
+      ImportTok.setKind(tok::kw_import);
+      ImportTok.setLocation(ModuleImportLoc);
+      ImportTok.setIdentifierInfo(getIdentifierInfo("import"));
+      ImportTok.setLength(6);
+
+      auto Action = HandleHeaderIncludeOrImport(
+          /*HashLoc*/ SourceLocation(), ImportTok, Suffix.front(), SemiLoc);
+      switch (Action.Kind) {
+      case ImportAction::None:
+        break;
 
-  bool ImportingHeader = Result.is(tok::header_name);
-  // Check for a header-name.
-  SmallVector<Token, 32> Suffix;
-  if (ImportingHeader) {
-    // Enter the header-name token into the token stream; a Lex action cannot
-    // both return a token and cache tokens (doing so would corrupt the token
-    // cache if the call to Lex comes from CachingLex / PeekAhead).
-    Suffix.push_back(Result);
+      case ImportAction::ModuleBegin:
+        // Let the parser know we're textually entering the module.
+        Suffix.emplace_back();
+        Suffix.back().startToken();
+        Suffix.back().setKind(tok::annot_module_begin);
+        Suffix.back().setLocation(SemiLoc);
+        Suffix.back().setAnnotationEndLoc(SemiLoc);
+        Suffix.back().setAnnotationValue(Action.ModuleForHeader);
+        [[fallthrough]];
+
+      case ImportAction::ModuleImport:
+      case ImportAction::HeaderUnitImport:
+      case ImportAction::SkippedModuleImport:
+        // We chose to import (or textually enter) the file. Convert the
+        // header-name token into a header unit annotation token.
+        Suffix[0].setKind(tok::annot_header_unit);
+        Suffix[0].setAnnotationEndLoc(Suffix[0].getLocation());
+        Suffix[0].setAnnotationValue(Action.ModuleForHeader);
+        // FIXME: Call the moduleImport callback?
+        break;
+      case ImportAction::Failure:
+        assert(TheModuleLoader.HadFatalFailure &&
+               "This should be an early exit only to a fatal error");
+        Result.setKind(tok::eof);
+        CurLexer->cutOffLexing();
+        EnterTokens(Suffix);
+        return true;
+      }
 
-    // Consume the pp-import-suffix and expand any macros in it now. We'll add
-    // it back into the token stream later.
-    CollectPpImportSuffix(Suffix);
-    if (Suffix.back().isNot(tok::semi)) {
-      // This is not a pp-import after all.
       EnterTokens(Suffix);
       return false;
     }
+  } else {
+    Lex(Result);
+  }
 
-    // C++2a [cpp.module]p1:
-    //   The ';' preprocessing-token terminating a pp-import shall not have
-    //   been produced by macro replacement.
-    SourceLocation SemiLoc = Suffix.back().getLocation();
-    if (SemiLoc.isMacroID())
-      Diag(SemiLoc, diag::err_header_import_semi_in_macro);
-
-    // Reconstitute the import token.
-    Token ImportTok;
-    ImportTok.startToken();
-    ImportTok.setKind(tok::kw_import);
-    ImportTok.setLocation(ModuleImportLoc);
-    ImportTok.setIdentifierInfo(getIdentifierInfo("import"));
-    ImportTok.setLength(6);
-
-    auto Action = HandleHeaderIncludeOrImport(
-        /*HashLoc*/ SourceLocation(), ImportTok, Suffix.front(), SemiLoc);
-    switch (Action.Kind) {
-    case ImportAction::None:
-      break;
-
-    case ImportAction::ModuleBegin:
-      // Let the parser know we're textually entering the module.
-      Suffix.emplace_back();
-      Suffix.back().startToken();
-      Suffix.back().setKind(tok::annot_module_begin);
-      Suffix.back().setLocation(SemiLoc);
-      Suffix.back().setAnnotationEndLoc(SemiLoc);
-      Suffix.back().setAnnotationValue(Action.ModuleForHeader);
-      [[fallthrough]];
-
-    case ImportAction::ModuleImport:
-    case ImportAction::HeaderUnitImport:
-    case ImportAction::SkippedModuleImport:
-      // We chose to import (or textually enter) the file. Convert the
-      // header-name token into a header unit annotation token.
-      Suffix[0].setKind(tok::annot_header_unit);
-      Suffix[0].setAnnotationEndLoc(Suffix[0].getLocation());
-      Suffix[0].setAnnotationValue(Action.ModuleForHeader);
-      // FIXME: Call the moduleImport callback?
-      break;
-    case ImportAction::Failure:
-      assert(TheModuleLoader.HadFatalFailure &&
-             "This should be an early exit only to a fatal error");
-      Result.setKind(tok::eof);
-      CurLexer->cutOffLexing();
-      EnterTokens(Suffix);
+  if (Result.isOneOf(tok::identifier, tok::colon)) {
+    EnterToken(Result, /*IsReinject=*/false);
+    if (!LexModuleName(Result, /*IsImport=*/true))
       return true;
+    auto *Info = Result.getAnnotationValueAs<ModuleNameInfo *>();
+    if (getLangOpts().CPlusPlusModules) {
+      // Under the standard C++ Modules, the dot is just part of the module
+      // name, and not a real hierarchy separator. Flatten such module names
+      // now.
+      //
+      // FIXME: Is this the right level to be performing this transformation?
+      std::string FlatModuleName;
+      if (Info->getTokens().front().is(tok::colon)) {
+        // Importing a module partition is only allowed in C++20 Modules, and
+        // only from within a named module TU.
+        if (NamedModuleImportPath.empty() && ModuleDeclState.isNamedModule())
+          FlatModuleName = llvm::Twine(ModuleDeclState.getPrimaryName())
+                               .concat(Info->getFlatName())
+                               .str();
+        else
+          return true;
+      } else {
+        FlatModuleName = Info->getFlatName();
+      }
+      NamedModuleImportPath.emplace_back(getIdentifierInfo(FlatModuleName),
+                                         Result.getLocation());
+    } else {
+      Info->getModuleIdPath(NamedModuleImportPath);
     }
-
-    EnterTokens(Suffix);
-    return false;
-  }
-
-  // The token sequence
-  //
-  //   import identifier (. identifier)*
-  //
-  // indicates a module import directive. We already saw the 'import'
-  // contextual keyword, so now we're looking for the identifiers.
-  if (ModuleImportExpectsIdentifier && Result.getKind() == tok::identifier) {
-    // We expected to see an identifier here, and we did; continue handling
-    // identifiers.
-    NamedModuleImportPath.push_back(
-        std::make_pair(Result.getIdentifierInfo(), Result.getLocation()));
-    ModuleImportExpectsIdentifier = false;
-    CurLexerCallback = CLK_LexAfterModuleImport;
-    return true;
-  }
-
-  // If we're expecting a '.' or a ';', and we got a '.', then wait until we
-  // see the next identifier. (We can also see a '[[' that begins an
-  // attribute-specifier-seq here under the Standard C++ Modules.)
-  if (!ModuleImportExpectsIdentifier && Result.getKind() == tok::period) {
-    ModuleImportExpectsIdentifier = true;
-    CurLexerCallback = CLK_LexAfterModuleImport;
-    return true;
   }
 
   // If we didn't recognize a module name at all, this is not a (valid) import.
@@ -1291,24 +1458,6 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) {
     SemiLoc = Suffix.back().getLocation();
   }
 
-  // Under the standard C++ Modules, the dot is just part of the module name,
-  // and not a real hierarchy separator. Flatten such module names now.
-  //
-  // FIXME: Is this the right level to be performing this transformation?
-  std::string FlatModuleName;
-  if (getLangOpts().CPlusPlusModules) {
-    for (auto &Piece : NamedModuleImportPath) {
-      // If the FlatModuleName ends with colon, it implies it is a partition.
-      if (!FlatModuleName.empty() && FlatModuleName.back() != ':')
-        FlatModuleName += ".";
-      FlatModuleName += Piece.first->getName();
-    }
-    SourceLocation FirstPathLoc = NamedModuleImportPath[0].second;
-    NamedModuleImportPath.clear();
-    NamedModuleImportPath.push_back(
-        std::make_pair(getIdentifierInfo(FlatModuleName), FirstPathLoc));
-  }
-
   Module *Imported = nullptr;
   // We don't/shouldn't load the standard c++20 modules when preprocessing.
   if (getLangOpts().Modules && !isInImportingCXXNamedModules()) {
@@ -1330,6 +1479,33 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) {
   return true;
 }
 
+/// Lex a token following the 'module' contextual keyword.
+///
+/// [cpp.module]/p2:
+/// The pp-tokens, if any, of a pp-module shall be of the form:
+///       pp-module-name pp-module-partition[opt] pp-tokens[opt]
+///
+/// where the pp-tokens (if any) shall not begin with a ( preprocessing token
+/// and the grammar non-terminals are defined as:
+///       pp-module-name:
+///             pp-module-name-qualifier[opt] identifier
+///       pp-module-partition:
+///             : pp-module-name-qualifier[opt] identifier
+///       pp-module-name-qualifier:
+///             identifier .
+///             pp-module-name-qualifier identifier .
+/// No identifier in the pp-module-name or pp-module-partition shall currently
+/// be defined as an object-like macro.
+///
+/// [cpp.module]/p3:
+/// Any preprocessing tokens after the module preprocessing token in the module
+/// directive are processed just as in normal text.
+bool Preprocessor::LexAfterModuleDecl(Token &Result) {
+  // Figure out what kind of lexer we actually have.
+  recomputeCurLexerKind();
+  return LexModuleName(Result, /*IsImport=*/false);
+}
+
 void Preprocessor::makeModuleVisible(Module *M, SourceLocation Loc) {
   CurSubmoduleState->VisibleModules.setVisible(
       M, Loc, [](Module *) {},
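
For reference, a minimal sketch of the source forms the module-name lexing
above is meant to recognize (the module and partition names are hypothetical):

    export module A.B;     // pp-module-name with a '.' qualifier
    export module A:Part;  // pp-module-name followed by a pp-module-partition
    import A.B;            // the same qualifier grammar after 'import'
    import :Part;          // partition import within the same module
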
diff --git a/clang/lib/Lex/TokenConcatenation.cpp b/clang/lib/Lex/TokenConcatenation.cpp
index 865879d180533..cdb636923b9e9 100644
--- a/clang/lib/Lex/TokenConcatenation.cpp
+++ b/clang/lib/Lex/TokenConcatenation.cpp
@@ -160,6 +160,13 @@ static char GetFirstChar(const Preprocessor &PP, const Token &Tok) {
 bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok,
                                      const Token &PrevTok,
                                      const Token &Tok) const {
+  // If the previous token is a module name, we need to avoid concatenating it
+  // with the current token; otherwise there would be an extra space between
+  // 'M' and ';' when printing the following code:
+  //
+  // import M;
+  if (PrevTok.is(tok::annot_module_name))
+    return false;
   // Conservatively assume that every annotation token that has a printable
   // form requires whitespace.
   if (PrevTok.isAnnotation())
@@ -190,6 +197,9 @@ bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok,
       return true;
     ConcatInfo &= ~aci_avoid_equal;
   }
+
+  if (Tok.is(tok::annot_module_name))
+    return true;
   if (Tok.isAnnotation()) {
     // Modules annotation can show up when generated automatically for includes.
     assert(Tok.isOneOf(tok::annot_module_include, tok::annot_module_begin,
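
To illustrate the two annot_module_name cases above, a sketch of the intended
pretty-printer (-E) behavior; this is assumed behavior, not captured output:

    import M;   // one space after 'import', none before ';'
    // Returning true before the annotation keeps 'import' and 'M' apart;
    // returning false after it avoids printing 'import M ;'.
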
diff --git a/clang/lib/Parse/ParseAST.cpp b/clang/lib/Parse/ParseAST.cpp
index e008cc0e38ced..77ab3b556da58 100644
--- a/clang/lib/Parse/ParseAST.cpp
+++ b/clang/lib/Parse/ParseAST.cpp
@@ -152,15 +152,7 @@ void clang::ParseAST(Sema &S, bool PrintStats, bool SkipFunctionBodies) {
   bool HaveLexer = S.getPreprocessor().getCurrentLexer();
 
   if (HaveLexer) {
-    llvm::TimeTraceScope TimeScope("Frontend", [&]() {
-      llvm::TimeTraceMetadata M;
-      if (llvm::isTimeTraceVerbose()) {
-        const SourceManager &SM = S.getSourceManager();
-        if (const auto *FE = SM.getFileEntryForID(SM.getMainFileID()))
-          M.File = FE->tryGetRealPathName();
-      }
-      return M;
-    });
+    llvm::TimeTraceScope TimeScope("Frontend");
     P.Initialize();
     Parser::DeclGroupPtrTy ADecl;
     Sema::ModuleImportState ImportState;
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 7ce9a9cea1c7a..577527d0318f2 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -3958,7 +3958,13 @@ void Parser::ParseDeclarationSpecifiers(
 
       // We're done with the declaration-specifiers.
       goto DoneWithDeclSpec;
-
+    case tok::annot_module_name: {
+      PP.EnterTokenStream(
+          Tok.getAnnotationValueAs<ModuleNameInfo *>()->getTokens(),
+          /*DisableMacroExpansion=*/true, /*IsReinject=*/false);
+      ConsumeAnyToken();
+      [[fallthrough]];
+    }
       // typedef-name
     case tok::kw___super:
     case tok::kw_decltype:
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index a12c375c8d48c..3d7c58e5b3c3c 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -3597,7 +3597,7 @@ void Parser::injectEmbedTokens() {
     I += 2;
   }
   PP.EnterTokenStream(std::move(Toks), /*DisableMacroExpansion=*/true,
-                      /*IsReinject=*/true);
+                      /*IsReinject=*/false);
   ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
 }
 
diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp
index aef4ddb758816..cc6f18b5b319f 100644
--- a/clang/lib/Parse/ParsePragma.cpp
+++ b/clang/lib/Parse/ParsePragma.cpp
@@ -14,7 +14,6 @@
 #include "clang/Basic/PragmaKinds.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
-#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Lex/Token.h"
 #include "clang/Parse/LoopHint.h"
 #include "clang/Parse/ParseDiagnostic.h"
@@ -412,19 +411,6 @@ struct PragmaRISCVHandler : public PragmaHandler {
   Sema &Actions;
 };
 
-struct PragmaMCFuncHandler : public PragmaHandler {
-  PragmaMCFuncHandler(bool ReportError)
-      : PragmaHandler("mc_func"), ReportError(ReportError) {}
-  void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer,
-                    Token &Tok) override {
-    if (ReportError)
-      PP.Diag(Tok, diag::err_pragma_mc_func_not_supported);
-  }
-
-private:
-  bool ReportError = false;
-};
-
 void markAsReinjectedForRelexing(llvm::MutableArrayRef<clang::Token> Toks) {
   for (auto &T : Toks)
     T.setFlag(clang::Token::IsReinjected);
@@ -582,12 +568,6 @@ void Parser::initializePragmaHandlers() {
     RISCVPragmaHandler = std::make_unique<PragmaRISCVHandler>(Actions);
     PP.AddPragmaHandler("clang", RISCVPragmaHandler.get());
   }
-
-  if (getTargetInfo().getTriple().isOSAIX()) {
-    MCFuncPragmaHandler = std::make_unique<PragmaMCFuncHandler>(
-        PP.getPreprocessorOpts().ErrorOnPragmaMcfuncOnAIX);
-    PP.AddPragmaHandler(MCFuncPragmaHandler.get());
-  }
 }
 
 void Parser::resetPragmaHandlers() {
@@ -722,11 +702,6 @@ void Parser::resetPragmaHandlers() {
     PP.RemovePragmaHandler("clang", RISCVPragmaHandler.get());
     RISCVPragmaHandler.reset();
   }
-
-  if (getTargetInfo().getTriple().isOSAIX()) {
-    PP.RemovePragmaHandler(MCFuncPragmaHandler.get());
-    MCFuncPragmaHandler.reset();
-  }
 }
 
 /// Handle the annotation token produced for #pragma unused(...)
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index bdb3fc051d0b3..22d38adc28ebe 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -299,15 +299,6 @@ StmtResult Parser::ParseStatementOrDeclarationAfterAttributes(
     goto Retry;
   }
 
-  case tok::kw_template: {
-    SourceLocation DeclEnd;
-    ParsedAttributes Attrs(AttrFactory);
-    ParseTemplateDeclarationOrSpecialization(DeclaratorContext::Block, DeclEnd,
-                                             Attrs,
-                                             getAccessSpecifierIfPresent());
-    return StmtError();
-  }
-
   case tok::kw_case:                // C99 6.8.1: labeled-statement
     return ParseCaseStatement(StmtCtx);
   case tok::kw_default:             // C99 6.8.1: labeled-statement
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index 5ebe71e496a2e..afb2e1e416168 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -2511,18 +2511,28 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) {
   }
 
   SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 2> Path;
-  if (ParseModuleName(ModuleLoc, Path, /*IsImport*/ false))
+  if (Tok.isNot(tok::annot_module_name)) {
+    Diag(Tok, diag::err_module_expected_ident) << /*IsImport=*/false;
+    SkipUntil(tok::semi, StopBeforeMatch);
+    return nullptr;
+  }
+
+  auto *Info = Tok.getAnnotationValueAs<ModuleNameInfo *>();
+  ConsumeAnnotationToken();
+  if (ParseModuleName(ModuleLoc, Info->getModuleName(), Path,
+                      /*IsImport=*/false))
     return nullptr;
 
   // Parse the optional module-partition.
   SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 2> Partition;
-  if (Tok.is(tok::colon)) {
-    SourceLocation ColonLoc = ConsumeToken();
+  if (Info->hasPartitionName()) {
+    SourceLocation ColonLoc = Info->getColonToken().getLocation();
     if (!getLangOpts().CPlusPlusModules)
       Diag(ColonLoc, diag::err_unsupported_module_partition)
           << SourceRange(ColonLoc, Partition.back().second);
     // Recover by ignoring the partition name.
-    else if (ParseModuleName(ModuleLoc, Partition, /*IsImport*/ false))
+    else if (ParseModuleName(ModuleLoc, Info->getPartitionName(), Partition,
+                             /*IsImport=*/false))
       return nullptr;
   }
 
@@ -2581,18 +2591,32 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc,
     // This is a header import that the preprocessor mapped to a module import.
     HeaderUnit = reinterpret_cast<Module *>(Tok.getAnnotationValue());
     ConsumeAnnotationToken();
-  } else if (Tok.is(tok::colon)) {
-    SourceLocation ColonLoc = ConsumeToken();
-    if (!getLangOpts().CPlusPlusModules)
-      Diag(ColonLoc, diag::err_unsupported_module_partition)
-          << SourceRange(ColonLoc, Path.back().second);
-    // Recover by leaving partition empty.
-    else if (ParseModuleName(ColonLoc, Path, /*IsImport*/ true))
-      return nullptr;
-    else
-      IsPartition = true;
   } else {
-    if (ParseModuleName(ImportLoc, Path, /*IsImport*/ true))
+    if (Tok.isNot(tok::annot_module_name)) {
+      if (Tok.is(tok::code_completion)) {
+        cutOffParsing();
+        Actions.CodeCompletion().CodeCompleteModuleImport(ImportLoc, Path);
+        return nullptr;
+      }
+      Diag(Tok, diag::err_module_expected_ident) << /*IsImport=*/true;
+      SkipUntil(tok::semi, StopBeforeMatch);
+      return nullptr;
+    }
+    auto *Info = Tok.getAnnotationValueAs<ModuleNameInfo *>();
+    ConsumeAnnotationToken();
+    if (Info->hasPartitionName()) {
+      SourceLocation ColonLoc = Info->getColonToken().getLocation();
+      if (!getLangOpts().CPlusPlusModules)
+        Diag(ColonLoc, diag::err_unsupported_module_partition)
+            << SourceRange(ColonLoc, Path.back().second);
+      // Recover by leaving partition empty.
+      else if (ParseModuleName(ColonLoc, Info->getPartitionName(), Path,
+                               /*IsImport=*/true))
+        return nullptr;
+      else
+        IsPartition = true;
+    } else if (ParseModuleName(ImportLoc, Info->getModuleName(), Path,
+                               /*IsImport=*/true))
       return nullptr;
   }
 
@@ -2689,32 +2713,31 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc,
 ///         module-name-qualifier:
 ///           module-name-qualifier[opt] identifier '.'
 bool Parser::ParseModuleName(
-    SourceLocation UseLoc,
+    SourceLocation UseLoc, ArrayRef<Token> ModuleName,
     SmallVectorImpl<std::pair<IdentifierInfo *, SourceLocation>> &Path,
     bool IsImport) {
-  // Parse the module path.
-  while (true) {
-    if (!Tok.is(tok::identifier)) {
-      if (Tok.is(tok::code_completion)) {
-        cutOffParsing();
-        Actions.CodeCompletion().CodeCompleteModuleImport(UseLoc, Path);
-        return true;
-      }
-
-      Diag(Tok, diag::err_module_expected_ident) << IsImport;
-      SkipUntil(tok::semi);
+  ModuleNameInfo::getModuleIdPath(ModuleName, Path);
+  // E.g. 'import A.B.' -- the last module-name token is not an identifier.
+  if (ModuleName.back().isNot(tok::identifier)) {
+    if (Tok.is(tok::code_completion)) {
+      cutOffParsing();
+      Actions.CodeCompletion().CodeCompleteModuleImport(UseLoc, Path);
       return true;
     }
+    Diag(ModuleName.back(), diag::err_module_expected_ident) << IsImport;
+    SkipUntil(tok::semi, StopBeforeMatch);
+    return true;
+  }
 
-    // Record this part of the module path.
-    Path.push_back(std::make_pair(Tok.getIdentifierInfo(), Tok.getLocation()));
-    ConsumeToken();
-
-    if (Tok.isNot(tok::period))
-      return false;
-
-    ConsumeToken();
+  // [cpp.module]/p2: where the pp-tokens (if any) shall not begin with a (
+  // preprocessing token [...]
+  //
+  // Skip until ';' to recover.
+  if (getLangOpts().CPlusPlusModules && Tok.is(tok::l_paren)) {
+    SkipUntil(tok::semi, StopBeforeMatch);
+    return true;
   }
+  return false;
 }
 
 /// Try recover parser when module annotation appears where it must not
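
A short sketch of inputs the rewritten ParseModuleName rejects, based on the
checks above (the module names are made up):

    import A.B.;   // trailing '.': last module-name token is not an identifier
    module M(1);   // [cpp.module]/p2: the pp-tokens shall not begin with '('
    // Both paths use SkipUntil(tok::semi, StopBeforeMatch) so the parser can
    // recover at the ';'.
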
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index be5b7b92dfe6f..055e66a0c3486 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -783,77 +783,51 @@ static void ProcessVersionedAPINotes(
   }
 }
 
-static std::optional<api_notes::Context>
-UnwindNamespaceContext(DeclContext *DC, api_notes::APINotesManager &APINotes) {
-  if (auto NamespaceContext = dyn_cast<NamespaceDecl>(DC)) {
-    for (auto Reader : APINotes.findAPINotes(NamespaceContext->getLocation())) {
-      // Retrieve the context ID for the parent namespace of the decl.
-      std::stack<NamespaceDecl *> NamespaceStack;
-      {
-        for (auto CurrentNamespace = NamespaceContext; CurrentNamespace;
-             CurrentNamespace =
-                 dyn_cast<NamespaceDecl>(CurrentNamespace->getParent())) {
-          if (!CurrentNamespace->isInlineNamespace())
-            NamespaceStack.push(CurrentNamespace);
-        }
-      }
-      std::optional<api_notes::ContextID> NamespaceID;
-      while (!NamespaceStack.empty()) {
-        auto CurrentNamespace = NamespaceStack.top();
-        NamespaceStack.pop();
-        NamespaceID =
-            Reader->lookupNamespaceID(CurrentNamespace->getName(), NamespaceID);
-        if (!NamespaceID)
-          return std::nullopt;
-      }
-      if (NamespaceID)
-        return api_notes::Context(*NamespaceID,
-                                  api_notes::ContextKind::Namespace);
-    }
-  }
-  return std::nullopt;
-}
-
-static std::optional<api_notes::Context>
-UnwindTagContext(TagDecl *DC, api_notes::APINotesManager &APINotes) {
-  assert(DC && "tag context must not be null");
-  for (auto Reader : APINotes.findAPINotes(DC->getLocation())) {
-    // Retrieve the context ID for the parent tag of the decl.
-    std::stack<TagDecl *> TagStack;
-    {
-      for (auto CurrentTag = DC; CurrentTag;
-           CurrentTag = dyn_cast<TagDecl>(CurrentTag->getParent()))
-        TagStack.push(CurrentTag);
-    }
-    assert(!TagStack.empty());
-    std::optional<api_notes::Context> Ctx =
-        UnwindNamespaceContext(TagStack.top()->getDeclContext(), APINotes);
-    while (!TagStack.empty()) {
-      auto CurrentTag = TagStack.top();
-      TagStack.pop();
-      auto CtxID = Reader->lookupTagID(CurrentTag->getName(), Ctx);
-      if (!CtxID)
-        return std::nullopt;
-      Ctx = api_notes::Context(*CtxID, api_notes::ContextKind::Tag);
-    }
-    return Ctx;
-  }
-  return std::nullopt;
-}
-
 /// Process API notes that are associated with this declaration, mapping them
 /// to attributes as appropriate.
 void Sema::ProcessAPINotes(Decl *D) {
   if (!D)
     return;
 
+  auto GetNamespaceContext =
+      [&](DeclContext *DC) -> std::optional<api_notes::Context> {
+    if (auto NamespaceContext = dyn_cast<NamespaceDecl>(DC)) {
+      for (auto Reader :
+           APINotes.findAPINotes(NamespaceContext->getLocation())) {
+        // Retrieve the context ID for the parent namespace of the decl.
+        std::stack<NamespaceDecl *> NamespaceStack;
+        {
+          for (auto CurrentNamespace = NamespaceContext; CurrentNamespace;
+               CurrentNamespace =
+                   dyn_cast<NamespaceDecl>(CurrentNamespace->getParent())) {
+            if (!CurrentNamespace->isInlineNamespace())
+              NamespaceStack.push(CurrentNamespace);
+          }
+        }
+        std::optional<api_notes::ContextID> NamespaceID;
+        while (!NamespaceStack.empty()) {
+          auto CurrentNamespace = NamespaceStack.top();
+          NamespaceStack.pop();
+          NamespaceID = Reader->lookupNamespaceID(CurrentNamespace->getName(),
+                                                  NamespaceID);
+          if (!NamespaceID)
+            break;
+        }
+        if (NamespaceID)
+          return api_notes::Context(*NamespaceID,
+                                    api_notes::ContextKind::Namespace);
+      }
+    }
+    return std::nullopt;
+  };
+
   // Globals.
   if (D->getDeclContext()->isFileContext() ||
       D->getDeclContext()->isNamespace() ||
       D->getDeclContext()->isExternCContext() ||
       D->getDeclContext()->isExternCXXContext()) {
     std::optional<api_notes::Context> APINotesContext =
-        UnwindNamespaceContext(D->getDeclContext(), APINotes);
+        GetNamespaceContext(D->getDeclContext());
     // Global variables.
     if (auto VD = dyn_cast<VarDecl>(D)) {
       for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
@@ -925,8 +899,6 @@ void Sema::ProcessAPINotes(Decl *D) {
       }
 
       for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
-        if (auto ParentTag = dyn_cast<TagDecl>(Tag->getDeclContext()))
-          APINotesContext = UnwindTagContext(ParentTag, APINotes);
         auto Info = Reader->lookupTag(LookupName, APINotesContext);
         ProcessVersionedAPINotes(*this, Tag, Info);
       }
@@ -1042,24 +1014,23 @@ void Sema::ProcessAPINotes(Decl *D) {
     }
   }
 
-  if (auto TagContext = dyn_cast<TagDecl>(D->getDeclContext())) {
+  if (auto CXXRecord = dyn_cast<CXXRecordDecl>(D->getDeclContext())) {
+    auto GetRecordContext = [&](api_notes::APINotesReader *Reader)
+        -> std::optional<api_notes::ContextID> {
+      auto ParentContext = GetNamespaceContext(CXXRecord->getDeclContext());
+      if (auto Found = Reader->lookupTagID(CXXRecord->getName(), ParentContext))
+        return *Found;
+
+      return std::nullopt;
+    };
+
     if (auto CXXMethod = dyn_cast<CXXMethodDecl>(D)) {
       for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
-        if (auto Context = UnwindTagContext(TagContext, APINotes)) {
-          auto Info =
-              Reader->lookupCXXMethod(Context->id, CXXMethod->getName());
+        if (auto Context = GetRecordContext(Reader)) {
+          auto Info = Reader->lookupCXXMethod(*Context, CXXMethod->getName());
           ProcessVersionedAPINotes(*this, CXXMethod, Info);
         }
       }
     }
-
-    if (auto Tag = dyn_cast<TagDecl>(D)) {
-      for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
-        if (auto Context = UnwindTagContext(TagContext, APINotes)) {
-          auto Info = Reader->lookupTag(Tag->getName(), Context);
-          ProcessVersionedAPINotes(*this, Tag, Info);
-        }
-      }
-    }
   }
 }
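
For orientation, a hypothetical header showing the lookup order the
GetNamespaceContext lambda implies; inline namespaces are not pushed onto the
namespace stack:

    namespace Outer {
    inline namespace v1 {        // skipped: isInlineNamespace()
    namespace Inner {
    void f();                    // context resolved as Outer -> Inner
    } // namespace Inner
    } // namespace v1
    } // namespace Outer
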
diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp
index 17566c226ec80..df83bbfb7aac8 100644
--- a/clang/lib/Sema/SemaAvailability.cpp
+++ b/clang/lib/Sema/SemaAvailability.cpp
@@ -107,12 +107,6 @@ ShouldDiagnoseAvailabilityOfDecl(Sema &S, const NamedDecl *D,
     break;
   }
 
-  // For alias templates, get the underlying declaration.
-  if (const auto *ADecl = dyn_cast<TypeAliasTemplateDecl>(D)) {
-    D = ADecl->getTemplatedDecl();
-    Result = D->getAvailability(Message);
-  }
-
   // Forward class declarations get their attributes from their definition.
   if (const auto *IDecl = dyn_cast<ObjCInterfaceDecl>(D)) {
     if (IDecl->getDefinition()) {
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 5fd8622c90dd8..50e7be9ea6020 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5247,10 +5247,6 @@ static void handleXRayLogArgsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 
 static void handlePatchableFunctionEntryAttr(Sema &S, Decl *D,
                                              const ParsedAttr &AL) {
-  if (S.Context.getTargetInfo().getTriple().isOSAIX()) {
-    S.Diag(AL.getLoc(), diag::err_aix_attr_unsupported) << AL;
-    return;
-  }
   uint32_t Count = 0, Offset = 0;
   if (!S.checkUInt32Argument(AL, AL.getArgAsExpr(0), Count, 0, true))
     return;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 439db55668cc6..8d24e34520e77 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -10909,14 +10909,6 @@ QualType Sema::CheckAdditionOperands(ExprResult &LHS, ExprResult &RHS,
   if (isObjCPointer && checkArithmeticOnObjCPointer(*this, Loc, PExp))
     return QualType();
 
-  // Arithmetic on label addresses is normally allowed, except when we add
-  // a ptrauth signature to the addresses.
-  if (isa<AddrLabelExpr>(PExp) && getLangOpts().PointerAuthIndirectGotos) {
-    Diag(Loc, diag::err_ptrauth_indirect_goto_addrlabel_arithmetic)
-        << /*addition*/ 1;
-    return QualType();
-  }
-
   // Check array bounds for pointer arithemtic
   CheckArrayAccess(PExp, IExp);
 
@@ -10991,15 +10983,6 @@ QualType Sema::CheckSubtractionOperands(ExprResult &LHS, ExprResult &RHS,
         checkArithmeticOnObjCPointer(*this, Loc, LHS.get()))
       return QualType();
 
-    // Arithmetic on label addresses is normally allowed, except when we add
-    // a ptrauth signature to the addresses.
-    if (isa<AddrLabelExpr>(LHS.get()) &&
-        getLangOpts().PointerAuthIndirectGotos) {
-      Diag(Loc, diag::err_ptrauth_indirect_goto_addrlabel_arithmetic)
-          << /*subtraction*/ 0;
-      return QualType();
-    }
-
     // The result type of a pointer-int computation is the pointer type.
     if (RHS.get()->getType()->isIntegerType()) {
       // Subtracting from a null pointer should produce a warning.
@@ -14134,14 +14117,7 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
       // Okay: we can take the address of a field.
       // Could be a pointer to member, though, if there is an explicit
       // scope qualifier for the class.
-
-      // [C++26] [expr.prim.id.general]
-      // If an id-expression E denotes a non-static non-type member
-      // of some class C [...] and if E is a qualified-id, E is
-      // not the un-parenthesized operand of the unary & operator [...]
-      // the id-expression is transformed into a class member access expression.
-      if (isa<DeclRefExpr>(op) && cast<DeclRefExpr>(op)->getQualifier() &&
-          !isa<ParenExpr>(OrigOp.get())) {
+      if (isa<DeclRefExpr>(op) && cast<DeclRefExpr>(op)->getQualifier()) {
         DeclContext *Ctx = dcl->getDeclContext();
         if (Ctx && Ctx->isRecord()) {
           if (dcl->getType()->isReferenceType()) {
@@ -14151,6 +14127,22 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
             return QualType();
           }
 
+          // C++11 [expr.unary.op] p4:
+          // A pointer to member is only formed when an explicit & is used and
+          // its operand is a qualified-id not enclosed in parentheses.
+          if (isa<ParenExpr>(OrigOp.get())) {
+            SourceLocation LeftParenLoc = OrigOp.get()->getBeginLoc(),
+                           RightParenLoc = OrigOp.get()->getEndLoc();
+
+            Diag(LeftParenLoc,
+                 diag::err_form_ptr_to_member_from_parenthesized_expr)
+                << SourceRange(OpLoc, RightParenLoc)
+                << FixItHint::CreateRemoval(LeftParenLoc)
+                << FixItHint::CreateRemoval(RightParenLoc);
+
+            // Continuing might lead to better error recovery.
+          }
+
           while (cast<RecordDecl>(Ctx)->isAnonymousStructOrUnion())
             Ctx = Ctx->getParent();
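
The parenthesized case above corresponds to C++11 [expr.unary.op]p4. A small
illustration of the distinction (not taken from the test suite):

    struct C { int m; };
    int C::*p1 = &C::m;     // OK: '&' on an unparenthesized qualified-id
    int C::*p2 = &(C::m);   // diagnosed: parentheses prevent forming a
                            // pointer to member; the fix-it removes them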
 
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 67e3c1d9067f3..3bd981cb442aa 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -6168,14 +6168,14 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
               DMC->getDefaultmapModifierLoc();
     }
     for (unsigned VC = 0; VC < DefaultmapKindNum; ++VC) {
-      auto K = static_cast<OpenMPDefaultmapClauseKind>(VC);
+      auto Kind = static_cast<OpenMPDefaultmapClauseKind>(VC);
       for (unsigned I = 0; I < OMPC_MAP_delete; ++I) {
-        ArrayRef<Expr *> ImplicitMap =
-            DSAChecker.getImplicitMap(K, static_cast<OpenMPMapClauseKind>(I));
+        ArrayRef<Expr *> ImplicitMap = DSAChecker.getImplicitMap(
+            Kind, static_cast<OpenMPMapClauseKind>(I));
         ImplicitMaps[VC][I].append(ImplicitMap.begin(), ImplicitMap.end());
       }
       ArrayRef<OpenMPMapModifierKind> ImplicitModifier =
-          DSAChecker.getImplicitMapModifier(K);
+          DSAChecker.getImplicitMapModifier(Kind);
       ImplicitMapModifiers[VC].append(ImplicitModifier.begin(),
                                       ImplicitModifier.end());
       std::fill_n(std::back_inserter(ImplicitMapModifiersLoc[VC]),
@@ -6249,10 +6249,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
           continue;
         CXXScopeSpec MapperIdScopeSpec;
         DeclarationNameInfo MapperId;
-        auto K = static_cast<OpenMPMapClauseKind>(ClauseKindCnt);
+        auto Kind = static_cast<OpenMPMapClauseKind>(ClauseKindCnt);
         if (OMPClause *Implicit = ActOnOpenMPMapClause(
                 nullptr, ImplicitMapModifiers[I], ImplicitMapModifiersLoc[I],
-                MapperIdScopeSpec, MapperId, K, /*IsMapTypeImplicit=*/true,
+                MapperIdScopeSpec, MapperId, Kind, /*IsMapTypeImplicit=*/true,
                 SourceLocation(), SourceLocation(), ImplicitMap,
                 OMPVarListLocTy())) {
           ClausesWithImplicit.emplace_back(Implicit);
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 87b1f98bbe5ac..9d96201625389 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -3344,10 +3344,6 @@ QualType Sema::CheckTemplateIdType(TemplateName Name,
         *this, /*PointOfInstantiation=*/TemplateLoc,
         /*Entity=*/AliasTemplate,
         /*TemplateArgs=*/TemplateArgLists.getInnermost());
-
-    // Diagnose uses of this alias.
-    (void)DiagnoseUseOfDecl(AliasTemplate, TemplateLoc);
-
     if (Inst.isInvalid())
       return QualType();
 
diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
index 0602d07c6b9b0..7dff2c8f98589 100644
--- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
+++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
@@ -363,7 +363,8 @@ struct ConvertConstructorToDeductionGuideTransform {
           return nullptr;
         // Constraints require that we substitute depth-1 arguments
         // to match depths when substituted for evaluation later
-        Depth1Args.push_back(SemaRef.Context.getInjectedTemplateArg(NewParam));
+        Depth1Args.push_back(SemaRef.Context.getCanonicalTemplateArgument(
+            SemaRef.Context.getInjectedTemplateArg(NewParam)));
 
         if (NestedPattern) {
           TemplateDeclInstantiator Instantiator(SemaRef, DC,
@@ -378,7 +379,8 @@ struct ConvertConstructorToDeductionGuideTransform {
                "Unexpected template parameter depth");
 
         AllParams.push_back(NewParam);
-        SubstArgs.push_back(SemaRef.Context.getInjectedTemplateArg(NewParam));
+        SubstArgs.push_back(SemaRef.Context.getCanonicalTemplateArgument(
+            SemaRef.Context.getInjectedTemplateArg(NewParam)));
       }
 
       // Substitute new template parameters into requires-clause if present.
@@ -793,8 +795,8 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
         /*NewIndex=*/AdjustedAliasTemplateArgs.size(),
         getTemplateParameterDepth(TP) + AdjustDepth);
 
-    TemplateArgument NewTemplateArgument =
-        Context.getInjectedTemplateArg(NewParam);
+    auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
+        Context.getInjectedTemplateArg(NewParam));
     AdjustedAliasTemplateArgs.push_back(NewTemplateArgument);
   }
   // Template arguments used to transform the template arguments in
@@ -820,8 +822,8 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
           getTemplateParameterDepth(TP) + AdjustDepth);
       FirstUndeducedParamIdx += 1;
       assert(TemplateArgsForBuildingRC[Index].isNull());
-      TemplateArgsForBuildingRC[Index] =
-          Context.getInjectedTemplateArg(NewParam);
+      TemplateArgsForBuildingRC[Index] = Context.getCanonicalTemplateArgument(
+          Context.getInjectedTemplateArg(NewParam));
       continue;
     }
     TemplateArgumentLoc Input =
@@ -921,8 +923,8 @@ Expr *buildIsDeducibleConstraint(Sema &SemaRef,
           /*NewIndex=*/TransformedTemplateArgs.size(),
           getTemplateParameterDepth(TP) + AdjustDepth);
 
-      TemplateArgument NewTemplateArgument =
-          Context.getInjectedTemplateArg(NewParam);
+      auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
+          Context.getInjectedTemplateArg(NewParam));
       TransformedTemplateArgs.push_back(NewTemplateArgument);
     }
     // Transformed the ReturnType to restore the uninstantiated depth.
@@ -1085,8 +1087,8 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
         getTemplateParameterDepth(TP));
     FPrimeTemplateParams.push_back(NewParam);
 
-    TemplateArgument NewTemplateArgument =
-        Context.getInjectedTemplateArg(NewParam);
+    auto NewTemplateArgument = Context.getCanonicalTemplateArgument(
+        Context.getInjectedTemplateArg(NewParam));
     TransformedDeducedAliasArgs[AliasTemplateParamIdx] = NewTemplateArgument;
   }
   unsigned FirstUndeducedParamIdx = FPrimeTemplateParams.size();
@@ -1107,7 +1109,8 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
     assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() &&
            "The argument must be null before setting");
     TemplateArgsForBuildingFPrime[FTemplateParamIdx] =
-        Context.getInjectedTemplateArg(NewParam);
+        Context.getCanonicalTemplateArgument(
+            Context.getInjectedTemplateArg(NewParam));
   }
 
   // To form a deduction guide f' from f, we leverage clang's instantiation
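
As context for the getCanonicalTemplateArgument changes, a minimal alias CTAD
case that goes through the synthesized guide f' (hypothetical names, modeled
on the ast-dump-ctad-alias test below):

    template <class T> struct Inner { Inner(T); };
    template <class U> using AInner = Inner<U>;
    AInner a(1.0);   // deduces AInner<double> via the alias deduction guide
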
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 8995d461362d7..a7bc6749c5852 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -2576,12 +2576,16 @@ createSubstDiag(Sema &S, TemplateDeductionInfo &Info,
   } else {
     ErrorLoc = Info.getLocation();
   }
+  char *MessageBuf = new (S.Context) char[Message.size()];
+  std::copy(Message.begin(), Message.end(), MessageBuf);
   SmallString<128> Entity;
   llvm::raw_svector_ostream OS(Entity);
   Printer(OS);
-  const ASTContext &C = S.Context;
-  return new (C) concepts::Requirement::SubstitutionDiagnostic{
-      C.backupStr(Entity), ErrorLoc, C.backupStr(Message)};
+  char *EntityBuf = new (S.Context) char[Entity.size()];
+  std::copy(Entity.begin(), Entity.end(), EntityBuf);
+  return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{
+      StringRef(EntityBuf, Entity.size()), ErrorLoc,
+      StringRef(MessageBuf, Message.size())};
 }
 
 concepts::Requirement::SubstitutionDiagnostic *
@@ -2590,9 +2594,10 @@ concepts::createSubstDiagAt(Sema &S, SourceLocation Location,
   SmallString<128> Entity;
   llvm::raw_svector_ostream OS(Entity);
   Printer(OS);
-  const ASTContext &C = S.Context;
-  return new (C) concepts::Requirement::SubstitutionDiagnostic{
-      /*SubstitutedEntity=*/C.backupStr(Entity),
+  char *EntityBuf = new (S.Context) char[Entity.size()];
+  llvm::copy(Entity, EntityBuf);
+  return new (S.Context) concepts::Requirement::SubstitutionDiagnostic{
+      /*SubstitutedEntity=*/StringRef(EntityBuf, Entity.size()),
       /*DiagLoc=*/Location, /*DiagMessage=*/StringRef()};
 }
 
@@ -2768,21 +2773,23 @@ TemplateInstantiator::TransformNestedRequirement(
     assert(!Trap.hasErrorOccurred() && "Substitution failures must be handled "
                                        "by CheckConstraintSatisfaction.");
   }
-  ASTContext &C = SemaRef.Context;
   if (TransConstraint.isUsable() &&
       TransConstraint.get()->isInstantiationDependent())
-    return new (C) concepts::NestedRequirement(TransConstraint.get());
+    return new (SemaRef.Context)
+        concepts::NestedRequirement(TransConstraint.get());
   if (TransConstraint.isInvalid() || !TransConstraint.get() ||
       Satisfaction.HasSubstitutionFailure()) {
     SmallString<128> Entity;
     llvm::raw_svector_ostream OS(Entity);
     Req->getConstraintExpr()->printPretty(OS, nullptr,
                                           SemaRef.getPrintingPolicy());
-    return new (C) concepts::NestedRequirement(
-        SemaRef.Context, C.backupStr(Entity), Satisfaction);
+    char *EntityBuf = new (SemaRef.Context) char[Entity.size()];
+    std::copy(Entity.begin(), Entity.end(), EntityBuf);
+    return new (SemaRef.Context) concepts::NestedRequirement(
+        SemaRef.Context, StringRef(EntityBuf, Entity.size()), Satisfaction);
   }
-  return new (C)
-      concepts::NestedRequirement(C, TransConstraint.get(), Satisfaction);
+  return new (SemaRef.Context) concepts::NestedRequirement(
+      SemaRef.Context, TransConstraint.get(), Satisfaction);
 }
 
 TypeSourceInfo *Sema::SubstType(TypeSourceInfo *T,
@@ -3419,16 +3426,11 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation,
     return true;
 
   llvm::TimeTraceScope TimeScope("InstantiateClass", [&]() {
-    llvm::TimeTraceMetadata M;
-    llvm::raw_string_ostream OS(M.Detail);
+    std::string Name;
+    llvm::raw_string_ostream OS(Name);
     Instantiation->getNameForDiagnostic(OS, getPrintingPolicy(),
                                         /*Qualified=*/true);
-    if (llvm::isTimeTraceVerbose()) {
-      auto Loc = SourceMgr.getExpansionLoc(Instantiation->getLocation());
-      M.File = SourceMgr.getFilename(Loc);
-      M.Line = SourceMgr.getExpansionLineNumber(Loc);
-    }
-    return M;
+    return Name;
   });
 
   Pattern = PatternDef;
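
The string-copying pattern reinstated throughout this file, shown in
isolation (a sketch; ASTContext overloads placement operator new, so the
buffer lives as long as the AST):

    char *Buf = new (Ctx) char[Str.size()];  // allocated in the ASTContext
    std::copy(Str.begin(), Str.end(), Buf);
    StringRef Persistent(Buf, Str.size());   // safe to store in AST nodes
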
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index a12d2eff1d2c8..97161febc15f7 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -4966,16 +4966,11 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
   }
 
   llvm::TimeTraceScope TimeScope("InstantiateFunction", [&]() {
-    llvm::TimeTraceMetadata M;
-    llvm::raw_string_ostream OS(M.Detail);
+    std::string Name;
+    llvm::raw_string_ostream OS(Name);
     Function->getNameForDiagnostic(OS, getPrintingPolicy(),
                                    /*Qualified=*/true);
-    if (llvm::isTimeTraceVerbose()) {
-      auto Loc = SourceMgr.getExpansionLoc(Function->getLocation());
-      M.File = SourceMgr.getFilename(Loc);
-      M.Line = SourceMgr.getExpansionLineNumber(Loc);
-    }
-    return M;
+    return Name;
   });
 
   // If we're performing recursive template instantiation, create our own
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index c1361efd8c5f2..a7fe20bd0a466 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -785,22 +785,29 @@ void ASTStmtReader::VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *E) {
   E->setRParenLoc(readSourceLocation());
 }
 
+static StringRef saveStrToCtx(const std::string &S, ASTContext &Ctx) {
+  char *Buf = new (Ctx) char[S.size()];
+  std::copy(S.begin(), S.end(), Buf);
+  return StringRef(Buf, S.size());
+}
+
 static ConstraintSatisfaction
 readConstraintSatisfaction(ASTRecordReader &Record) {
   ConstraintSatisfaction Satisfaction;
   Satisfaction.IsSatisfied = Record.readInt();
   Satisfaction.ContainsErrors = Record.readInt();
-  const ASTContext &C = Record.getContext();
   if (!Satisfaction.IsSatisfied) {
     unsigned NumDetailRecords = Record.readInt();
     for (unsigned i = 0; i != NumDetailRecords; ++i) {
       if (/* IsDiagnostic */Record.readInt()) {
         SourceLocation DiagLocation = Record.readSourceLocation();
-        StringRef DiagMessage = C.backupStr(Record.readString());
+        StringRef DiagMessage =
+            saveStrToCtx(Record.readString(), Record.getContext());
 
         Satisfaction.Details.emplace_back(
-            new (C) ConstraintSatisfaction::SubstitutionDiagnostic(
-                DiagLocation, DiagMessage));
+            new (Record.getContext())
+                ConstraintSatisfaction::SubstitutionDiagnostic(DiagLocation,
+                                                               DiagMessage));
       } else
         Satisfaction.Details.emplace_back(Record.readExpr());
     }
@@ -821,10 +828,12 @@ void ASTStmtReader::VisitConceptSpecializationExpr(
 
 static concepts::Requirement::SubstitutionDiagnostic *
 readSubstitutionDiagnostic(ASTRecordReader &Record) {
-  const ASTContext &C = Record.getContext();
-  StringRef SubstitutedEntity = C.backupStr(Record.readString());
+  StringRef SubstitutedEntity =
+      saveStrToCtx(Record.readString(), Record.getContext());
+
   SourceLocation DiagLoc = Record.readSourceLocation();
-  StringRef DiagMessage = C.backupStr(Record.readString());
+  StringRef DiagMessage =
+      saveStrToCtx(Record.readString(), Record.getContext());
 
   return new (Record.getContext())
       concepts::Requirement::SubstitutionDiagnostic{SubstitutedEntity, DiagLoc,
@@ -910,21 +919,22 @@ void ASTStmtReader::VisitRequiresExpr(RequiresExpr *E) {
                   std::move(*Req));
       } break;
       case concepts::Requirement::RK_Nested: {
-        ASTContext &C = Record.getContext();
         bool HasInvalidConstraint = Record.readInt();
         if (HasInvalidConstraint) {
-          StringRef InvalidConstraint = C.backupStr(Record.readString());
-          R = new (C) concepts::NestedRequirement(
+          StringRef InvalidConstraint =
+              saveStrToCtx(Record.readString(), Record.getContext());
+          R = new (Record.getContext()) concepts::NestedRequirement(
               Record.getContext(), InvalidConstraint,
               readConstraintSatisfaction(Record));
           break;
         }
         Expr *E = Record.readExpr();
         if (E->isInstantiationDependent())
-          R = new (C) concepts::NestedRequirement(E);
+          R = new (Record.getContext()) concepts::NestedRequirement(E);
         else
-          R = new (C) concepts::NestedRequirement(
-              C, E, readConstraintSatisfaction(Record));
+          R = new (Record.getContext())
+              concepts::NestedRequirement(Record.getContext(), E,
+                                          readConstraintSatisfaction(Record));
       } break;
     }
     if (!R)
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 3f837564cf47c..f82288f1099e8 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -373,14 +373,14 @@ static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
 }
 
 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
-  std::string RegName = getRegionName(Region), OffsetStr = "";
-
-  if (auto ConcreteOffset = getConcreteValue(Offset))
-    OffsetStr = formatv(" {0}", ConcreteOffset);
-
-  return {
-      formatv("Out of bound access to memory preceding {0}", RegName),
-      formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)};
+  std::string RegName = getRegionName(Region);
+  SmallString<128> Buf;
+  llvm::raw_svector_ostream Out(Buf);
+  Out << "Access of " << RegName << " at negative byte offset";
+  if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
+    Out << ' ' << ConcreteIdx->getValue();
+  return {formatv("Out of bound access to memory preceding {0}", RegName),
+          std::string(Buf)};
 }
 
 /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
@@ -609,7 +609,7 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   // CHECK UPPER BOUND
   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
   if (auto KnownSize = Size.getAs<NonLoc>()) {
-    // In a situation where both underflow and overflow are possible (but the
+    // In a situation where both underflow and overflow are possible (but the
     // index is either tainted or known to be invalid), the logic of this
     // checker will first assume that the offset is non-negative, and then
     // (with this additional assumption) it will detect an overflow error.
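
A hypothetical input the reverted getPrecedesMsgs wording applies to; the
message text is assumed from the format strings above, not verified output:

    int buf[4];
    int f() { return buf[-3]; }
    // warning: Out of bound access to memory preceding 'buf'
    // note:    Access of 'buf' at negative byte offset -12
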
diff --git a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
index b198b1c2ff4d1..1a75d7b52ad6e 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp
@@ -18,7 +18,6 @@
 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
@@ -31,40 +30,8 @@ namespace {
 class BuiltinFunctionChecker : public Checker<eval::Call> {
 public:
   bool evalCall(const CallEvent &Call, CheckerContext &C) const;
-
-private:
-  // From: clang/include/clang/Basic/Builtins.def
-  // C++ standard library builtins in namespace 'std'.
-  const CallDescriptionSet BuiltinLikeStdFunctions{
-      {CDM::SimpleFunc, {"std", "addressof"}},        //
-      {CDM::SimpleFunc, {"std", "__addressof"}},      //
-      {CDM::SimpleFunc, {"std", "as_const"}},         //
-      {CDM::SimpleFunc, {"std", "forward"}},          //
-      {CDM::SimpleFunc, {"std", "forward_like"}},     //
-      {CDM::SimpleFunc, {"std", "move"}},             //
-      {CDM::SimpleFunc, {"std", "move_if_noexcept"}}, //
-  };
-
-  bool isBuiltinLikeFunction(const CallEvent &Call) const;
 };
 
-} // namespace
-
-bool BuiltinFunctionChecker::isBuiltinLikeFunction(
-    const CallEvent &Call) const {
-  const auto *FD = llvm::dyn_cast_or_null<FunctionDecl>(Call.getDecl());
-  if (!FD || FD->getNumParams() != 1)
-    return false;
-
-  if (QualType RetTy = FD->getReturnType();
-      !RetTy->isPointerType() && !RetTy->isReferenceType())
-    return false;
-
-  if (QualType ParmTy = FD->getParamDecl(0)->getType();
-      !ParmTy->isPointerType() && !ParmTy->isReferenceType())
-    return false;
-
-  return BuiltinLikeStdFunctions.contains(Call);
 }
 
 bool BuiltinFunctionChecker::evalCall(const CallEvent &Call,
@@ -77,11 +44,6 @@ bool BuiltinFunctionChecker::evalCall(const CallEvent &Call,
   const LocationContext *LCtx = C.getLocationContext();
   const Expr *CE = Call.getOriginExpr();
 
-  if (isBuiltinLikeFunction(Call)) {
-    C.addTransition(state->BindExpr(CE, LCtx, Call.getArgSVal(0)));
-    return true;
-  }
-
   switch (FD->getBuiltinID()) {
   default:
     return false;
diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index 53770532609d5..e8d538388e56c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -254,8 +254,7 @@ inline void assertStreamStateOpened(const StreamState *SS) {
 }
 
 class StreamChecker : public Checker<check::PreCall, eval::Call,
-                                     check::DeadSymbols, check::PointerEscape,
-                                     check::ASTDecl<TranslationUnitDecl>> {
+                                     check::DeadSymbols, check::PointerEscape> {
   BugType BT_FileNull{this, "NULL stream pointer", "Stream handling error"};
   BugType BT_UseAfterClose{this, "Closed stream", "Stream handling error"};
   BugType BT_UseAfterOpenFailed{this, "Invalid stream",
@@ -277,21 +276,11 @@ class StreamChecker : public Checker<check::PreCall, eval::Call,
                                      const CallEvent *Call,
                                      PointerEscapeKind Kind) const;
 
-  /// Finds the declarations of 'FILE *stdin, *stdout, *stderr'.
-  void checkASTDecl(const TranslationUnitDecl *TU, AnalysisManager &,
-                    BugReporter &) const;
-
   const BugType *getBT_StreamEof() const { return &BT_StreamEof; }
   const BugType *getBT_IndeterminatePosition() const {
     return &BT_IndeterminatePosition;
   }
 
-  /// Assumes that the result of 'fopen' can't alias with the pointee of
-  /// 'stdin', 'stdout' or 'stderr'.
-  ProgramStateRef assumeNoAliasingWithStdStreams(ProgramStateRef State,
-                                                 DefinedSVal RetVal,
-                                                 CheckerContext &C) const;
-
   const NoteTag *constructSetEofNoteTag(CheckerContext &C,
                                         SymbolRef StreamSym) const {
     return C.getNoteTag([this, StreamSym](PathSensitiveBugReport &BR) {
@@ -462,10 +451,6 @@ class StreamChecker : public Checker<check::PreCall, eval::Call,
   /// The built-in va_list type is platform-specific
   mutable QualType VaListType;
 
-  mutable const VarDecl *StdinDecl = nullptr;
-  mutable const VarDecl *StdoutDecl = nullptr;
-  mutable const VarDecl *StderrDecl = nullptr;
-
   void evalFopen(const FnDescription *Desc, const CallEvent &Call,
                  CheckerContext &C) const;
 
@@ -902,30 +887,6 @@ bool StreamChecker::evalCall(const CallEvent &Call, CheckerContext &C) const {
   return C.isDifferent();
 }
 
-ProgramStateRef StreamChecker::assumeNoAliasingWithStdStreams(
-    ProgramStateRef State, DefinedSVal RetVal, CheckerContext &C) const {
-  auto assumeRetNE = [&C, RetVal](ProgramStateRef State,
-                                  const VarDecl *Var) -> ProgramStateRef {
-    if (!Var)
-      return State;
-    const auto *LCtx = C.getLocationContext();
-    auto &StoreMgr = C.getStoreManager();
-    auto &SVB = C.getSValBuilder();
-    SVal VarValue = State->getSVal(StoreMgr.getLValueVar(Var, LCtx));
-    auto NoAliasState =
-        SVB.evalBinOp(State, BO_NE, RetVal, VarValue, SVB.getConditionType())
-            .castAs<DefinedOrUnknownSVal>();
-    return State->assume(NoAliasState, true);
-  };
-
-  assert(State);
-  State = assumeRetNE(State, StdinDecl);
-  State = assumeRetNE(State, StdoutDecl);
-  State = assumeRetNE(State, StderrDecl);
-  assert(State);
-  return State;
-}
-
 void StreamChecker::evalFopen(const FnDescription *Desc, const CallEvent &Call,
                               CheckerContext &C) const {
   ProgramStateRef State = C.getState();
@@ -938,7 +899,6 @@ void StreamChecker::evalFopen(const FnDescription *Desc, const CallEvent &Call,
   assert(RetSym && "RetVal must be a symbol here.");
 
   State = State->BindExpr(CE, C.getLocationContext(), RetVal);
-  State = assumeNoAliasingWithStdStreams(State, RetVal, C);
 
   // Bifurcate the state into two: one with a valid FILE* pointer, the other
   // with a NULL.
@@ -2057,36 +2017,6 @@ ProgramStateRef StreamChecker::checkPointerEscape(
   return State;
 }
 
-static const VarDecl *
-getGlobalStreamPointerByName(const TranslationUnitDecl *TU, StringRef VarName) {
-  ASTContext &Ctx = TU->getASTContext();
-  const auto &SM = Ctx.getSourceManager();
-  const QualType FileTy = Ctx.getFILEType();
-
-  if (FileTy.isNull())
-    return nullptr;
-
-  const QualType FilePtrTy = Ctx.getPointerType(FileTy).getCanonicalType();
-
-  auto LookupRes = TU->lookup(&Ctx.Idents.get(VarName));
-  for (const Decl *D : LookupRes) {
-    if (auto *VD = dyn_cast_or_null<VarDecl>(D)) {
-      if (SM.isInSystemHeader(VD->getLocation()) && VD->hasExternalStorage() &&
-          VD->getType().getCanonicalType() == FilePtrTy) {
-        return VD;
-      }
-    }
-  }
-  return nullptr;
-}
-
-void StreamChecker::checkASTDecl(const TranslationUnitDecl *TU,
-                                 AnalysisManager &, BugReporter &) const {
-  StdinDecl = getGlobalStreamPointerByName(TU, "stdin");
-  StdoutDecl = getGlobalStreamPointerByName(TU, "stdout");
-  StderrDecl = getGlobalStreamPointerByName(TU, "stderr");
-}
-
 //===----------------------------------------------------------------------===//
 // Checker registration.
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/StaticAnalyzer/Core/BugReporter.cpp b/clang/lib/StaticAnalyzer/Core/BugReporter.cpp
index d73dc40cf03fb..e2002bfbe594a 100644
--- a/clang/lib/StaticAnalyzer/Core/BugReporter.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugReporter.cpp
@@ -2198,7 +2198,7 @@ const Decl *PathSensitiveBugReport::getDeclWithIssue() const {
 void BasicBugReport::Profile(llvm::FoldingSetNodeID& hash) const {
   hash.AddInteger(static_cast<int>(getKind()));
   hash.AddPointer(&BT);
-  hash.AddString(getShortDescription());
+  hash.AddString(Description);
   assert(Location.isValid());
   Location.Profile(hash);
 
@@ -2213,7 +2213,7 @@ void BasicBugReport::Profile(llvm::FoldingSetNodeID& hash) const {
 void PathSensitiveBugReport::Profile(llvm::FoldingSetNodeID &hash) const {
   hash.AddInteger(static_cast<int>(getKind()));
   hash.AddPointer(&BT);
-  hash.AddString(getShortDescription());
+  hash.AddString(Description);
   PathDiagnosticLocation UL = getUniqueingLocation();
   if (UL.isValid()) {
     UL.Profile(hash);
diff --git a/clang/test/APINotes/Inputs/Headers/Methods.apinotes b/clang/test/APINotes/Inputs/Headers/Methods.apinotes
index 618c2cb14b47f..0fa6991a51ff4 100644
--- a/clang/test/APINotes/Inputs/Headers/Methods.apinotes
+++ b/clang/test/APINotes/Inputs/Headers/Methods.apinotes
@@ -6,27 +6,3 @@ Tags:
   - Name: getIncremented
     Availability: none
     AvailabilityMsg: "oh no"
-  - Name: getDecremented
-    Availability: none
-    AvailabilityMsg: "this should have no effect"
-- Name: Outer
-  Tags:
-  - Name: Inner
-    Methods:
-    - Name: getDecremented
-      Availability: none
-      AvailabilityMsg: "nope"
-    - Name: getIncremented
-      Availability: none
-      AvailabilityMsg: "this should have no effect"
-  Methods:
-  - Name: getDecremented
-    Availability: none
-    AvailabilityMsg: "this should have no effect"  
-Functions:
-- Name: getIncremented
-  Availability: none
-  AvailabilityMsg: "this should have no effect"
-- Name: getDecremented
-  Availability: none
-  AvailabilityMsg: "this should have no effect"
diff --git a/clang/test/APINotes/Inputs/Headers/Methods.h b/clang/test/APINotes/Inputs/Headers/Methods.h
index 6a96b12762871..f46fe31533e5d 100644
--- a/clang/test/APINotes/Inputs/Headers/Methods.h
+++ b/clang/test/APINotes/Inputs/Headers/Methods.h
@@ -4,6 +4,7 @@ struct IntWrapper {
   IntWrapper getIncremented() const { return {value + 1}; }
 };
 
+// TODO: support nested tags
 struct Outer {
   struct Inner {
     int value;
diff --git a/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes b/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
index b7d962e0efda6..68073932d600e 100644
--- a/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
+++ b/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
@@ -41,9 +41,6 @@ Namespaces:
             Methods:
               - Name: methodInNestedNamespace
                 SwiftName: swiftMethodInNestedNamespace()
-            Tags:
-              - Name: inner_char_box
-                SwiftName: InnerCharBox
         Namespaces:
           - Name: Namespace1
             Tags:
diff --git a/clang/test/APINotes/Inputs/Headers/Namespaces.h b/clang/test/APINotes/Inputs/Headers/Namespaces.h
index c7a891f205c06..e996b8ffa6b6e 100644
--- a/clang/test/APINotes/Inputs/Headers/Namespaces.h
+++ b/clang/test/APINotes/Inputs/Headers/Namespaces.h
@@ -10,9 +10,6 @@ void funcInNestedNamespace(int i);
 struct char_box {
   char c;
   void methodInNestedNamespace();
-  struct inner_char_box {
-    char c;
-  };
 };
 }
 
diff --git a/clang/test/APINotes/methods.cpp b/clang/test/APINotes/methods.cpp
index 910565745bea2..692f750ed66c7 100644
--- a/clang/test/APINotes/methods.cpp
+++ b/clang/test/APINotes/methods.cpp
@@ -1,17 +1,9 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Methods -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -x c++
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Methods -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter IntWrapper::getIncremented -x c++ | FileCheck --check-prefix=CHECK-METHOD %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Methods -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Outer::Inner::getDecremented -x c++ | FileCheck --check-prefix=CHECK-DEEP-METHOD %s
 
 #include "Methods.h"
 
 // CHECK-METHOD: Dumping IntWrapper::getIncremented:
 // CHECK-METHOD-NEXT: CXXMethodDecl {{.+}} getIncremented
 // CHECK-METHOD: UnavailableAttr {{.+}} <<invalid sloc>> "oh no"
-
-// CHECK-DEEP-METHOD: Dumping Outer::Inner::getDecremented:
-// CHECK-DEEP-METHOD-NEXT: CXXMethodDecl {{.+}} getDecremented
-// CHECK-DEEP-METHOD: UnavailableAttr {{.+}} <<invalid sloc>> "nope"
-
-// CHECK-METHOD-NOT: this should have no effect
-// CHECK-DEEP-METHOD-NOT: this should have no effect
diff --git a/clang/test/APINotes/namespaces.cpp b/clang/test/APINotes/namespaces.cpp
index df108b02e6301..a6517a324b9c5 100644
--- a/clang/test/APINotes/namespaces.cpp
+++ b/clang/test/APINotes/namespaces.cpp
@@ -8,7 +8,6 @@
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested2::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NESTED-NAMESPACE %s
-// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box::inner_char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-STRUCT-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::funcInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box::methodInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-METHOD-IN-NESTED-NAMESPACE %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE %s
@@ -57,10 +56,6 @@
 // CHECK-STRUCT-IN-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box
 // CHECK-STRUCT-IN-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "NestedCharBox"
 
-// CHECK-STRUCT-IN-STRUCT-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::char_box::inner_char_box:
-// CHECK-STRUCT-IN-STRUCT-IN-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct inner_char_box
-// CHECK-STRUCT-IN-STRUCT-IN-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "InnerCharBox"
-
 // CHECK-METHOD-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::char_box::methodInNestedNamespace:
 // CHECK-METHOD-IN-NESTED-NAMESPACE-NEXT: CXXMethodDecl {{.+}} imported in Namespaces methodInNestedNamespace
 // CHECK-METHOD-IN-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftMethodInNestedNamespace()"
diff --git a/clang/test/AST/Interp/atomic.c b/clang/test/AST/Interp/atomic.c
index c8469d4a938b8..c5fd9ab222934 100644
--- a/clang/test/AST/Interp/atomic.c
+++ b/clang/test/AST/Interp/atomic.c
@@ -58,16 +58,3 @@ _Static_assert(atomic_is_lock_free((atomic_short*)0), "");
 _Static_assert(atomic_is_lock_free((atomic_int*)0), "");
 _Static_assert(atomic_is_lock_free((atomic_long*)0), "");
 _Static_assert(atomic_is_lock_free(0 + (atomic_char*)0), "");
-
-_Static_assert(__atomic_always_lock_free(1, (void*)1), "");
-_Static_assert(__atomic_always_lock_free(1, (void*)-1), "");
-_Static_assert(!__atomic_always_lock_free(4, (void*)2), "");
-_Static_assert(!__atomic_always_lock_free(4, (void*)-2), "");
-_Static_assert(__atomic_always_lock_free(4, (void*)4), "");
-_Static_assert(__atomic_always_lock_free(4, (void*)-4), "");
-
-_Static_assert(__atomic_always_lock_free(1, "string"), "");
-_Static_assert(!__atomic_always_lock_free(2, "string"), "");
-_Static_assert(__atomic_always_lock_free(2, (int[2]){}), "");
-void dummyfn();
-_Static_assert(__atomic_always_lock_free(2, dummyfn) || 1, "");
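
For readers unfamiliar with the builtin exercised by the removed assertions: __atomic_always_lock_free(size, ptr) folds to a compile-time constant stating whether atomic operations on an object of the given size are always lock-free, and a non-null second argument is consulted only for the alignment it implies. A minimal sketch of that behavior (illustrative, not part of the test; results are target-dependent):

// Folds to true on typical 64-bit targets: a naturally aligned int is lock-free.
static_assert(__atomic_always_lock_free(sizeof(int), nullptr), "");
// The pointer contributes only its alignment: (void*)4 stands for "4-byte aligned",
// which is why the deleted asserts pass constant pointer values.
static_assert(__atomic_always_lock_free(4, (void*)4), "");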
diff --git a/clang/test/AST/Interp/codegen.cpp b/clang/test/AST/Interp/codegen.cpp
index a5583d953d234..8a0d070d19da3 100644
--- a/clang/test/AST/Interp/codegen.cpp
+++ b/clang/test/AST/Interp/codegen.cpp
@@ -18,16 +18,3 @@ struct S {
 S s;
 // CHECK: @sp = constant ptr getelementptr (i8, ptr @s, i64 16), align 8
 float &sp = s.c[3];
-
-
-namespace BaseClassOffsets {
-  struct A { int a; };
-  struct B { int b; };
-  struct C : A, B { int c; };
-
-  extern C c;
-  // CHECK: @_ZN16BaseClassOffsets1aE = global ptr @_ZN16BaseClassOffsets1cE, align 8
-  A* a = &c;
-  // CHECK: @_ZN16BaseClassOffsets1bE = global ptr getelementptr (i8, ptr @_ZN16BaseClassOffsets1cE, i64 4), align 8
-  B* b = &c;
-}
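
The deleted BaseClassOffsets case checked that converting a derived-class pointer to its second base applies a constant byte offset. A minimal sketch of the layout rule the CHECK lines encode, assuming the usual Itanium C++ ABI layout:

struct A { int a; };
struct B { int b; };
struct C : A, B { int c; };

C c;
A *pa = &c; // no adjustment: A is the primary (first) base subobject
B *pb = &c; // adjusted by sizeof(A), i.e. 4 bytes here, matching the getelementptr offset above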
diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp
index b1631f7822ce0..a088c0a7b0272 100644
--- a/clang/test/AST/ast-dump-ctad-alias.cpp
+++ b/clang/test/AST/ast-dump-ctad-alias.cpp
@@ -32,24 +32,22 @@ Out2<double>::AInner t(1.0);
 // CHECK-NEXT: |     | |-UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'Concept'
 // CHECK-NEXT: |     | | |-TemplateArgument type 'int'
 // CHECK-NEXT: |     | | | `-BuiltinType {{.*}} 'int'
-// CHECK-NEXT: |     | | `-TemplateArgument type 'Y':'type-parameter-1-0'
-// CHECK-NEXT: |     | |   `-TemplateTypeParmType {{.*}} 'Y' dependent depth 1 index 0
-// CHECK-NEXT: |     | |     `-TemplateTypeParm {{.*}} 'Y'
+// CHECK-NEXT: |     | | `-TemplateArgument type 'type-parameter-1-0'
+// CHECK-NEXT: |     | |   `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent depth 1 index 0
 // CHECK-NEXT: |     | `-TypeTraitExpr {{.*}} 'bool' __is_deducible
 // CHECK-NEXT: |     |   |-DeducedTemplateSpecializationType {{.*}} 'Out2<double>::AInner' dependent
 // CHECK-NEXT: |     |   | `-name: 'Out2<double>::AInner'
 // CHECK-NEXT: |     |   |   `-TypeAliasTemplateDecl {{.+}} AInner{{$}}
-// CHECK-NEXT: |     |   `-ElaboratedType {{.*}} 'Inner<Y>' sugar dependent
-// CHECK-NEXT: |     |     `-TemplateSpecializationType {{.*}} 'Inner<Y>' dependent
+// CHECK-NEXT: |     |   `-ElaboratedType {{.*}} 'Inner<type-parameter-1-0>' sugar dependent
+// CHECK-NEXT: |     |     `-TemplateSpecializationType {{.*}} 'Inner<type-parameter-1-0>' dependent
 // CHECK-NEXT: |     |       |-name: 'Inner':'Out<int>::Inner' qualified
 // CHECK-NEXT: |     |       | `-ClassTemplateDecl {{.+}} Inner{{$}}
-// CHECK-NEXT: |     |       `-TemplateArgument type 'Y'
-// CHECK-NEXT: |     |         `-SubstTemplateTypeParmType {{.*}} 'Y'
+// CHECK-NEXT: |     |       `-TemplateArgument type 'type-parameter-1-0'
+// CHECK-NEXT: |     |         `-SubstTemplateTypeParmType {{.*}} 'type-parameter-1-0'
 // CHECK-NEXT: |     |           |-FunctionTemplate {{.*}} '<deduction guide for Inner>'
-// CHECK-NEXT: |     |           `-TemplateTypeParmType {{.*}} 'Y' dependent depth 1 index 0
-// CHECK-NEXT: |     |             `-TemplateTypeParm {{.*}} 'Y'
-// CHECK-NEXT: |     |-CXXDeductionGuideDecl {{.*}} <deduction guide for AInner> 'auto (Y) -> Inner<Y>'
-// CHECK-NEXT: |     | `-ParmVarDecl {{.*}} 'Y'
+// CHECK-NEXT: |     |           `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent depth 1 index 0
+// CHECK-NEXT: |     |-CXXDeductionGuideDecl {{.*}} <deduction guide for AInner> 'auto (type-parameter-0-0) -> Inner<type-parameter-0-0>'
+// CHECK-NEXT: |     | `-ParmVarDecl {{.*}} 'type-parameter-0-0'
 // CHECK-NEXT: |     `-CXXDeductionGuideDecl {{.*}} used <deduction guide for AInner> 'auto (double) -> Inner<double>' implicit_instantiation
 // CHECK-NEXT: |       |-TemplateArgument type 'double'
 // CHECK-NEXT: |       | `-BuiltinType {{.*}} 'double'
@@ -79,8 +77,8 @@ AFoo3 afoo3{0, 1};
 // CHECK-NEXT: | |-UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'Concept'
 // CHECK-NEXT: | | |-TemplateArgument type 'int'
 // CHECK-NEXT: | | | `-BuiltinType {{.*}} 'int'
-// CHECK-NEXT: | | `-TemplateArgument type 'V'
-// CHECK-NEXT: | |   `-TemplateTypeParmType {{.*}} 'V' dependent depth 0 index 1
+// CHECK-NEXT: | | `-TemplateArgument type 'type-parameter-0-1'
+// CHECK-NEXT: | |   `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent depth 0 index 1
 
 template <typename... T1>
 struct Foo {
@@ -90,16 +88,16 @@ struct Foo {
 template <typename...T2>
 using AFoo = Foo<T2...>;
 AFoo a(1, 2);
-// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for AFoo> 'auto (T2...) -> Foo<T2...>'
-// CHECK-NEXT: | | `-ParmVarDecl {{.*}} 'T2...' pack
+// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for AFoo> 'auto (type-parameter-0-0...) -> Foo<type-parameter-0-0...>'
+// CHECK-NEXT: | | `-ParmVarDecl {{.*}} 'type-parameter-0-0...' pack
 // CHECK-NEXT: | `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for AFoo> 'auto (int, int) -> Foo<int, int>' implicit_instantiation
 
 template <typename T>
 using BFoo = Foo<T, T>;
 BFoo b2(1.0, 2.0);
-// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for BFoo> 'auto (T, T) -> Foo<T, T>'
-// CHECK-NEXT: | | |-ParmVarDecl {{.*}} 'T'
-// CHECK-NEXT: | | `-ParmVarDecl {{.*}} 'T'
+// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for BFoo> 'auto (type-parameter-0-0, type-parameter-0-0) -> Foo<type-parameter-0-0, type-parameter-0-0>'
+// CHECK-NEXT: | | |-ParmVarDecl {{.*}} 'type-parameter-0-0'
+// CHECK-NEXT: | | `-ParmVarDecl {{.*}} 'type-parameter-0-0'
 // CHECK-NEXT: | `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for BFoo> 'auto (double, double) -> Foo<double, double>' implicit_instantiation
 
 namespace GH90209 {
diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
index a379a47515668..29326ec1f9280 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h
@@ -263,13 +263,11 @@ namespace std {
   template< class T > struct remove_reference<T&>  {typedef T type;};
   template< class T > struct remove_reference<T&&> {typedef T type;};
 
-  template<typename T> typename remove_reference<T>::type&& move(T&& a);
-  template <typename T> T *__addressof(T &x);
-  template <typename T> T *addressof(T &x);
-  template <typename T> const T& as_const(T& x);
-  template <typename T> T&& forward(T&& x);
-  // FIXME: Declare forward_like
-  // FIXME: Declare move_if_noexcept
+  template<class T> 
+  typename remove_reference<T>::type&& move(T&& a) {
+    typedef typename remove_reference<T>::type&& RvalRef;
+    return static_cast<RvalRef>(a);
+  }
 
   template< class T >
   using remove_reference_t = typename remove_reference<T>::type;
@@ -756,7 +754,7 @@ namespace std {
     template<class InputIter, class OutputIter>
     OutputIter __copy(InputIter II, InputIter IE, OutputIter OI) {
       while (II != IE)
-        *OI++ = *II++; // #system_header_simulator_cxx_std_copy_impl_loop
+        *OI++ = *II++;
 
       return OI;
     }
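
The hunk above replaces a declaration-only std::move in the simulator header with a full definition, so the analyzer can inline it; the definition is just a cast to an rvalue reference. A minimal usage sketch of what that cast enables (illustrative, not from the test suite):

#include <string>
#include <utility>

void consume() {
  std::string a = "payload";
  // The rvalue cast selects the move constructor; 'a' is left valid but unspecified.
  std::string b = std::move(a);
}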
diff --git a/clang/test/Analysis/builtin-functions.cpp b/clang/test/Analysis/builtin-functions.cpp
index f7bafabd23cbc..8719193e405c4 100644
--- a/clang/test/Analysis/builtin-functions.cpp
+++ b/clang/test/Analysis/builtin-functions.cpp
@@ -1,16 +1,6 @@
 // RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -analyzer-checker=core,debug.ExprInspection %s -std=c++11 -verify
 // RUN: %clang_analyze_cc1 -triple x86_64-pc-windows-msvc19.11.0 -fms-extensions -analyzer-checker=core,debug.ExprInspection %s -std=c++11 -verify
 
-#include "Inputs/system-header-simulator-cxx.h"
-
-namespace std {
-// Intentionally not using an "auto" return type here, as such must also have a definition.
-template <typename T, typename U> constexpr int&& forward_like(U&& x) noexcept;
-template <typename T> const T& move_if_noexcept(T& x) noexcept;
-} // namespace std
-
-template <typename T> void clang_analyzer_dump_ref(T&&);
-template <typename T> void clang_analyzer_dump_ptr(T*);
 void clang_analyzer_eval(bool);
 void clang_analyzer_warnIfReached();
 
@@ -18,16 +8,6 @@ void testAddressof(int x) {
   clang_analyzer_eval(&x == __builtin_addressof(x)); // expected-warning{{TRUE}}
 }
 
-void testStdBuiltinLikeFunctions(int x) {
-  clang_analyzer_dump_ptr(std::addressof(x));           // expected-warning{{&x}}
-  clang_analyzer_dump_ptr(std::__addressof(x));         // expected-warning{{&x}}
-  clang_analyzer_dump_ref(std::as_const(x));            // expected-warning{{&x}}
-  clang_analyzer_dump_ref(std::forward<int &>(x));      // expected-warning{{&x}}
-  clang_analyzer_dump_ref(std::forward_like<int &>(x)); // expected-warning{{&x}}
-  clang_analyzer_dump_ref(std::move(x));                // expected-warning{{&x}}
-  clang_analyzer_dump_ref(std::move_if_noexcept(x));    // expected-warning{{&x}}
-}
-
 void testSize() {
   struct {
     int x;
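
The deleted declarations deliberately avoided deduced (auto) return types because a function with a placeholder return type cannot be called until that type is deduced, i.e. until its definition has been seen. A minimal sketch of the rule (hypothetical names):

auto f();                  // declaration only: return type not yet deduced
// int bad = f();          // ill-formed here: 'f' used before its return type is known
auto f() { return 42; }    // definition deduces 'int'
int ok = f();              // OK from this point on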
diff --git a/clang/test/Analysis/diagnostics/explicit-suppression.cpp b/clang/test/Analysis/diagnostics/explicit-suppression.cpp
index 3a904dac1c0fe..24586e37fe207 100644
--- a/clang/test/Analysis/diagnostics/explicit-suppression.cpp
+++ b/clang/test/Analysis/diagnostics/explicit-suppression.cpp
@@ -19,6 +19,6 @@ class C {
 void testCopyNull(C *I, C *E) {
   std::copy(I, E, (C *)0);
 #ifndef SUPPRESSED
-  // expected-warning@#system_header_simulator_cxx_std_copy_impl_loop {{Called C++ object pointer is null}}
+  // expected-warning@../Inputs/system-header-simulator-cxx.h:757 {{Called C++ object pointer is null}}
 #endif
 }
diff --git a/clang/test/Analysis/issue-94193.cpp b/clang/test/Analysis/issue-94193.cpp
deleted file mode 100644
index 97acfdd8685be..0000000000000
--- a/clang/test/Analysis/issue-94193.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// RUN: %clang_analyze_cc1 %s -verify -analyzer-checker=core
-
-#include "Inputs/system-header-simulator-cxx.h"
-
-
-namespace GH94193 {
-template<typename T> class optional {
-  union {
-    char x;
-    T uvalue;
-  };
-  bool holds_value = false;
-public:
-  optional() = default;
-  optional(const optional&) = delete;
-  optional(optional&&) = delete;
-  template <typename U = T> explicit optional(U&& value) : holds_value(true) {
-    new (static_cast<void*>(std::addressof(uvalue))) T(std::forward<U>(value));
-  }
-  optional& operator=(const optional&) = delete;
-  optional& operator=(optional&&) = delete;
-  explicit operator bool() const {
-    return holds_value;
-  }
-  T& unwrap() & {
-    return uvalue; // no-warning: returns a valid value
-  }
-};
-
-int top1(int x) {
-  optional<int> opt{x}; // note: Ctor was inlined.
-  return opt.unwrap();  // no-warning: returns a valid value
-}
-
-std::string *top2() {
-  std::string a = "123";
-  // expected-warning@+2 {{address of stack memory associated with local variable 'a' returned}} diagnosed by -Wreturn-stack-address
-  // expected-warning@+1 {{Address of stack memory associated with local variable 'a' returned to caller [core.StackAddressEscape]}}
-  return std::addressof(a);
-}
-} // namespace GH94193
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index de70e483add1c..92f983d8b1561 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -17,27 +17,6 @@ int underflowWithDeref(void) {
   // expected-note@-2 {{Access of 'TenElements' at negative byte offset -4}}
 }
 
-int rng(void);
-int getIndex(void) {
-  switch (rng()) {
-    case 1: return -152;
-    case 2: return -160;
-    case 3: return -168;
-    default: return -172;
-  }
-}
-
-void gh86959(void) {
-  // Previously code like this produced many almost-identical bug reports that
-  // only differed in the offset value. Verify that now we only see one report.
-
-  // expected-note@+1 {{Entering loop body}}
-  while (rng())
-    TenElements[getIndex()] = 10;
-  // expected-warning@-1 {{Out of bound access to memory preceding 'TenElements'}}
-  // expected-note@-2 {{Access of 'TenElements' at negative byte offset -688}}
-}
-
 int scanf(const char *restrict fmt, ...);
 
 void taintedIndex(void) {
diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c
index b3a47ce4153d3..c924cbd36f759 100644
--- a/clang/test/Analysis/stream.c
+++ b/clang/test/Analysis/stream.c
@@ -498,15 +498,3 @@ void gh_93408_regression_ZeroSized(struct ZeroSized *buffer) {
   fread(buffer, 1, 1, f); // expected-warning {{Stream pointer might be NULL}} no-crash
   fclose(f);
 }
-
-extern FILE *stdout_like_ptr;
-void no_aliasing(void) {
-  FILE *f = fopen("file", "r");
-  clang_analyzer_eval(f == stdin);           // expected-warning {{FALSE}} no-TRUE
-  clang_analyzer_eval(f == stdout);          // expected-warning {{FALSE}} no-TRUE
-  clang_analyzer_eval(f == stderr);          // expected-warning {{FALSE}} no-TRUE
-  clang_analyzer_eval(f == stdout_like_ptr); // expected-warning {{FALSE}} expected-warning {{TRUE}}
-  if (f && f != stdout) {
-    fclose(f);
-  }
-} // no-leak: 'fclose()' is always called because 'f' cannot be 'stdout'.
diff --git a/clang/test/Analysis/use-after-move.cpp b/clang/test/Analysis/use-after-move.cpp
index 24d5dd8a8b3d2..33980e6ea2b8b 100644
--- a/clang/test/Analysis/use-after-move.cpp
+++ b/clang/test/Analysis/use-after-move.cpp
@@ -570,8 +570,13 @@ void differentBranchesTest(int i) {
   {
     A a;
     a.foo() > 0 ? a.foo() : A(std::move(a)).foo();
-    // peaceful-note@-1 {{Assuming the condition is true}}
-    // peaceful-note@-2 {{'?' condition is true}}
+#ifdef DFS
+    // peaceful-note@-2 {{Assuming the condition is false}}
+    // peaceful-note@-3 {{'?' condition is false}}
+#else
+    // peaceful-note@-5 {{Assuming the condition is true}}
+    // peaceful-note@-6 {{'?' condition is true}}
+#endif
   }
   // Same thing, but with a switch statement.
   {
diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt
index 299a35723b59d..8303269a9ad07 100644
--- a/clang/test/CMakeLists.txt
+++ b/clang/test/CMakeLists.txt
@@ -78,7 +78,6 @@ list(APPEND CLANG_TEST_DEPS
   clang-installapi
   clang-scan-deps
   clang-linker-wrapper
-  clang-nvlink-wrapper
   clang-offload-bundler
   clang-offload-packager
   diagtool
diff --git a/clang/test/CXX/cpp/cpp.module/p2.cppm b/clang/test/CXX/cpp/cpp.module/p2.cppm
new file mode 100644
index 0000000000000..966a88ccfa972
--- /dev/null
+++ b/clang/test/CXX/cpp/cpp.module/p2.cppm
@@ -0,0 +1,88 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/C.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/D.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/E.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/F.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/G.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/H.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/I.cppm -triple x86_64-linux-gnu -verify
+// RUN: %clang_cc1 -std=c++20 %t/J.cppm -triple x86_64-linux-gnu -verify
+
+//--- version.h
+#ifndef VERSION_H
+#define VERSION_H
+
+#define VERSION libv5
+#define A a
+#define B b
+#define C c
+#define FUNC_LIKE(X) function_like_##X
+#define ATTRS [[]]
+#define SEMICOLON ;
+
+#endif // VERSION_H
+
+//--- A.cppm
+module;
+#include "version.h"
+export module VERSION;  // expected-error {{the module name in a module declaration cannot contain an object-like macro 'VERSION'}}
+
+//--- B.cppm
+module;
+#include "version.h"
+export module A.B;      // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \
+                        // expected-error {{the module name in a module declaration cannot contain an object-like macro 'B'}}
+
+//--- C.cppm
+module;                             // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
+#include "version.h"
+export module A.FUNC_LIKE(foo):C;   // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \
+                                    // expected-error {{unexpected '(' after the module name in a module declaration}}
+
+//--- D.cppm
+module;                               // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
+#include "version.h"
+export module B.A.FUNC_LIKE(bar):C;   // expected-error {{the module name in a module declaration cannot contain an object-like macro 'B'}} \
+                                      // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \
+                                      // expected-error {{unexpected '(' after the module name in a module declaration}}
+
+//--- E.cppm
+module;
+#include "version.h"
+export module a.FUNC_LIKE:c; // OK, FUNC_LIKE would not be treated as a macro name.
+// expected-no-diagnostics
+
+//--- F.cppm
+module;
+#include "version.h"
+export module a.FUNC_LIKE:c ATTRS; // OK, FUNC_LIKE would not be treated as a macro name.
+// expected-no-diagnostics
+
+//--- G.cppm
+module;                               // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
+#include "version.h"
+export module A.FUNC_LIKE(B c:C ATTRS // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \
+                                      // expected-error {{unexpected '(' after the module name in a module declaration}}
+
+//--- H.cppm
+module;                                   // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
+#include "version.h"
+export module A.FUNC_LIKE(B,). c:C ATTRS  // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \
+                                          // expected-error {{unexpected '(' after the module name in a module declaration}}
+
+//--- I.cppm
+module;                                   // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
+#include "version.h"
+export module A.FUNC_LIKE(B,) c:C ATTRS   // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \
+                                          // expected-error {{unexpected '(' after the module name in a module declaration}}
+
+//--- J.cppm
+module;
+#include "version.h"
+export module unexpanded : unexpanded ATTRS SEMICOLON // OK, ATTRS and SEMICOLON can be expanded.
+// expected-no-diagnostics
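
Several of the cases above hinge on a preprocessor rule worth spelling out: a function-like macro is replaced only when its name is followed by '(', so a bare FUNC_LIKE in a module name is left untouched. A minimal sketch with hypothetical identifiers:

#define FUNC_LIKE(X) prefix_##X
int FUNC_LIKE = 0;         // not expanded: no '(' follows the name
int prefix_1 = 0;
int *p = &FUNC_LIKE(1);    // expanded: becomes '&prefix_1'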
diff --git a/clang/test/CXX/drs/cwg6xx.cpp b/clang/test/CXX/drs/cwg6xx.cpp
index 1c56dd3907152..069102d9c5975 100644
--- a/clang/test/CXX/drs/cwg6xx.cpp
+++ b/clang/test/CXX/drs/cwg6xx.cpp
@@ -1265,7 +1265,7 @@ namespace cwg687 { // cwg687 (9 c++20, but the issue is still considered open)
 
     // This is not.
     template g<int>(a);
-    // expected-error@-1 {{expected '<' after 'template'}}
+    // expected-error@-1 {{expected expression}}
   }
 }
 
diff --git a/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp b/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
index 170ca0a3f1c6b..162d59439d08e 100644
--- a/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
+++ b/clang/test/CXX/expr/expr.unary/expr.unary.op/p4.cpp
@@ -43,36 +43,18 @@ namespace test2 {
 }
 
 namespace GH40906 {
-struct S {
-    int x;
-    void func();
-    static_assert(__is_same_as(decltype((S::x)), int&), "");
-    static_assert(__is_same_as(decltype(&(S::x)), int*), "");
-
-    // FIXME: provide better error messages
-    static_assert(__is_same_as(decltype((S::func)), int&), ""); // expected-error {{call to non-static member function without an object argument}}
-    static_assert(__is_same_as(decltype(&(S::func)), int*), ""); // expected-error {{call to non-static member function without an object argument}}
-};
-static_assert(__is_same_as(decltype((S::x)), int&), "");
-static_assert(__is_same_as(decltype(&(S::x)), int*), "");
-static_assert(__is_same_as(decltype((S::func)), int&), ""); // expected-error {{call to non-static member function without an object argument}}
-static_assert(__is_same_as(decltype(&(S::func)), int*), ""); // expected-error {{call to non-static member function without an object argument}}
-
-struct A { int x;};
-
-char q(int *);
-short q(int A::*);
-
-template <typename T>
-constexpr int f(char (*)[sizeof(q(&T::x))]) { return 1; }
-
-template <typename T>
-constexpr int f(char (*)[sizeof(q(&(T::x)))]) { return 2; }
-
-constexpr int g(char (*p)[sizeof(char)] = 0) { return f<A>(p); }
-constexpr int h(char (*p)[sizeof(short)] = 0) { return f<A>(p); }
+  struct A {
+    int val;
+    void func() {}
+  };
 
-static_assert(g() == 2);
-static_assert(h() == 1);
+  void test() {
+    decltype(&(A::val)) ptr1; // expected-error {{cannot form pointer to member from a parenthesized expression; did you mean to remove the parentheses?}}
+    int A::* ptr2 = &(A::val); // expected-error {{invalid use of non-static data member 'val'}}
 
+    // FIXME: Error messages in these cases are less than clear, we can do
+    // better.
+    int size = sizeof(&(A::func)); // expected-error {{call to non-static member function without an object argument}}
+    void (A::* ptr3)() = &(A::func); // expected-error {{call to non-static member function without an object argument}}
+  }
 }
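
The rewritten test pins down the rule from [expr.unary.op]p4 that this file covers: a pointer to member is formed only by '&' applied to an unparenthesized qualified-id, so '&(A::val)' is not a pointer to member but an invalid use of the member itself. A minimal sketch:

struct A { int val; };
int A::*ok = &A::val;       // pointer to member: the qualified-id must be unparenthesized
// int A::*bad = &(A::val); // ill-formed: the parentheses force an ordinary lvalue use of 'val'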
diff --git a/clang/test/CXX/module/basic/basic.link/module-declaration.cpp b/clang/test/CXX/module/basic/basic.link/module-declaration.cpp
index d71358cc7a571..14bbc911febfc 100644
--- a/clang/test/CXX/module/basic/basic.link/module-declaration.cpp
+++ b/clang/test/CXX/module/basic/basic.link/module-declaration.cpp
@@ -8,27 +8,19 @@
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface -fmodule-file=x=%t/x.pcm %t/x.y.cppm -o %t/x.y.pcm
 //
 // Module implementation for unknown and known module. (The former is ill-formed.)
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M.cpp \
-// RUN:            -DTEST=1 -DEXPORT= -DMODULE_NAME=z
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x=%t/x.pcm -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M.cpp \
-// RUN:            -DTEST=2 -DEXPORT= -DMODULE_NAME=x
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M1.cpp
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x=%t/x.pcm -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M2.cpp
 //
 // Module interface for unknown and known module. (The latter is ill-formed due to
 // redefinition.)
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=3 -DEXPORT=export -DMODULE_NAME=z
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=4 -DEXPORT=export -DMODULE_NAME=x
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M3.cpp
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M4.cpp
 //
 // Miscellaneous syntax.
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=7 -DEXPORT=export -DMODULE_NAME='z elderberry'
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=8 -DEXPORT=export -DMODULE_NAME='z [[]]'
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=9 -DEXPORT=export -DMODULE_NAME='z [[fancy]]'
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=10 -DEXPORT=export -DMODULE_NAME='z [[maybe_unused]]'
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M5.cpp
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M6.cpp
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M7.cpp
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M8.cpp
 
 //--- x.cppm
 export module x;
@@ -38,17 +30,26 @@ int a, b;
 export module x.y;
 int c;
 
-//--- M.cpp
-
-EXPORT module MODULE_NAME;
-#if TEST == 7
-// expected-error@-2 {{expected ';'}} expected-error@-2 {{a type specifier is required}}
-#elif TEST == 9
-// expected-warning@-4 {{unknown attribute 'fancy' ignored}}
-#elif TEST == 10
-// expected-error-re@-6 {{'maybe_unused' attribute cannot be applied to a module{{$}}}}
-#elif TEST == 1
-// expected-error@-8 {{module 'z' not found}}
-#else
-// expected-no-diagnostics
-#endif
+//--- M1.cpp
+module z; // expected-error {{module 'z' not found}}
+
+//--- M2.cpp
+module x; // expected-no-diagnostics
+
+//--- M3.cpp
+export module z; // expected-no-diagnostics
+
+//--- M4.cpp
+export module x; // expected-no-diagnostics
+
+//--- M5.cpp
+export module z elderberry; // expected-error {{expected ';'}} expected-error {{a type specifier is required}}
+
+//--- M6.cpp
+export module z [[]]; // expected-no-diagnostics
+
+//--- M7.cpp
+export module z [[fancy]]; // expected-warning {{unknown attribute 'fancy' ignored}}
+
+//--- M8.cpp
+export module z [[maybe_unused]]; // expected-error-re {{'maybe_unused' attribute cannot be applied to a module{{$}}}}
diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
index 873e4c0edeac2..ecad4db32a7e9 100644
--- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
+++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
@@ -6,10 +6,12 @@
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface -fmodule-file=x=%t/x.pcm %t/x.y.cppm -o %t/x.y.pcm
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.b.cppm -o %t/a.b.pcm
 //
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm -verify %t/test.cpp \
-// RUN:            -DMODULE_NAME=z -DINTERFACE
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm -verify %t/test-interface.cpp \
+// RUN:            -DINTERFACE
 // RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm \
-// RUN:            -fmodule-file=a.b=%t/a.b.pcm -verify %t/test.cpp -DMODULE_NAME=a.b
+// RUN:            -fmodule-file=a.b=%t/a.b.pcm -verify %t/test.cpp
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm \
+// RUN:            -verify %t/test-module-not-found.cpp
 // RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm -verify %t/test.x.cpp
 
 //--- x.cppm
@@ -34,11 +36,8 @@ int use_2 = b; // ok
 int use_3 = c; // expected-error {{use of undeclared identifier 'c'}}
 
 //--- test.cpp
-#ifdef INTERFACE
-export module MODULE_NAME;
-#else
-module MODULE_NAME;
-#endif
+module;
+module a.b;
 
 import x;
 
@@ -51,6 +50,28 @@ import x.y;
 import x.; // expected-error {{expected a module name after 'import'}}
 import .x; // expected-error {{expected a module name after 'import'}}
 
-import blarg; // expected-error {{module 'blarg' not found}}
+int use_4 = c; // ok
+
+
+//--- test-interface.cpp
+module;
+export module z;
+
+import x;
+
+import x [[]];
+import x [[foo]]; // expected-warning {{unknown attribute 'foo' ignored}}
+import x [[noreturn]]; // expected-error {{'noreturn' attribute cannot be applied to a module import}}
+import x [[blarg::noreturn]]; // expected-warning {{unknown attribute 'noreturn' ignored}}
+
+import x.y;
+import x.; // expected-error {{expected a module name after 'import'}}
+import .x; // expected-error {{expected a module name after 'import'}}
 
 int use_4 = c; // ok
+
+//--- test-module-not-found.cpp
+module;
+
+import blarg; // expected-error {{module 'blarg' not found}}
+
diff --git a/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-0x.cpp b/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-0x.cpp
index 19793fe826372..51489c5eac5ad 100644
--- a/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-0x.cpp
+++ b/clang/test/CXX/temp/temp.arg/temp.arg.template/p3-0x.cpp
@@ -38,13 +38,3 @@ X1<int, X1a> inst_x1a;
 X1<long, X1b> inst_x1b;
 X1<short, X1c> inst_x1c;
 X1<short, X1d> inst_x1d; // expected-error{{template template argument has different template parameters than its corresponding template template paramete}}
-
-template <int> class X2; // expected-note{{template is declared here}} \
-                         // expected-note{{template is declared here}}
-class X3 : X2<1> {}; // expected-error{{implicit instantiation of undefined template 'X2<1>'}}
-
-template <int> class X4 : X3 {
-  struct {
-    X2<1> e; // expected-error{{implicit instantiation of undefined template 'X2<1>'}}
-  } f;
-};
diff --git a/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3-0x.cpp b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3-0x.cpp
index b9ff26a7620db..ed445360c4fdd 100644
--- a/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3-0x.cpp
+++ b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3-0x.cpp
@@ -78,7 +78,7 @@ namespace std_example {
   template<class T> struct A { // expected-note {{candidate}} expected-note {{implicit deduction guide}}
     template<class U>
     A(T &&, U &&, int *); // expected-note {{[with T = int, U = int] not viable: expects an rvalue}} \
-                          // expected-note {{implicit deduction guide declared as 'template <class T, class U> A(T &&, U &&, int *) -> A<T>'}}
+                          // expected-note {{implicit deduction guide declared as 'template <class T, class U> A(T &&, type-parameter-0-1 &&, int *) -> A<T>'}}
     A(T &&, int *);       // expected-note {{requires 2}} \
                           // expected-note {{implicit deduction guide declared as 'template <class T> A(T &&, int *) -> A<T>'}}
   };
diff --git a/clang/test/CXX/temp/temp.spec/temp.expl.spec/p2-20.cpp b/clang/test/CXX/temp/temp.spec/temp.expl.spec/p2-20.cpp
index 935dd360847bc..7a261fef27361 100644
--- a/clang/test/CXX/temp/temp.spec/temp.expl.spec/p2-20.cpp
+++ b/clang/test/CXX/temp/temp.spec/temp.expl.spec/p2-20.cpp
@@ -56,7 +56,7 @@ struct B {
   static int y;
 
   template<typename T>
-  int z; // expected-error {{non-static data member 'z' cannot be declared as a template}}
+  int z; // expected-error {{member 'z' declared as a template}}
 
   template<typename T>
   static int x<T*>;
@@ -65,7 +65,7 @@ struct B {
   static int y<T*>;
 
   template<typename T>
-  int x<T**>; // expected-error {{non-static data member 'x' cannot be declared as a template}}
+  int x<T**>; // expected-error {{member 'x' declared as a template}}
 
   template<>
   int x<short>;
@@ -169,7 +169,7 @@ struct D {
   static int y;
 
   template<typename U>
-  int z; // expected-error {{non-static data member 'z' cannot be declared as a template}}
+  int z; // expected-error {{member 'z' declared as a template}}
 
   template<typename U>
   static int x<U*>;
@@ -178,7 +178,7 @@ struct D {
   static int y<U*>;
 
   template<typename U>
-  int x<U**>; // expected-error {{non-static data member 'x' cannot be declared as a template}}
+  int x<U**>; // expected-error {{member 'x' declared as a template}}
 
   template<>
   int x<short>;
diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c
index 46a6c2b492dbe..32db136076d7d 100644
--- a/clang/test/CodeGen/attr-counted-by.c
+++ b/clang/test/CodeGen/attr-counted-by.c
@@ -1906,51 +1906,3 @@ struct test30_struct {
 void test30(struct test30_struct *ptr, int idx) {
   ptr->pcpu_refcnt.__padding[idx] = __builtin_dynamic_object_size(ptr, 1);
 }
-
-struct test31_empty {};
-
-struct test31_struct {
-  struct test31_empty y;
-  int s;
-  int x[] __counted_by(s);
-};
-
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31(
-// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// SANITIZE-WITH-ATTR-NEXT:  entry:
-// SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[PTR]], align 4
-// SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64
-// SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2
-// SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0)
-// SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-// SANITIZE-WITH-ATTR-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 4
-// SANITIZE-WITH-ATTR-NEXT:    [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0
-// SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]]
-// SANITIZE-WITH-ATTR-NEXT:    ret i32 [[CONV]]
-//
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31(
-// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// NO-SANITIZE-WITH-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[PTR]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0)
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 4
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    ret i32 [[CONV]]
-//
-// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test31(
-// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// SANITIZE-WITHOUT-ATTR-NEXT:  entry:
-// SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 -1
-//
-// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test31(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] {
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 -1
-//
-int test31(struct test31_struct *ptr, int idx) {
-  return __builtin_dynamic_object_size(ptr, 0);
-}
diff --git a/clang/test/CodeGen/ptrauth-function-attributes.c b/clang/test/CodeGen/ptrauth-function-attributes.c
index 7f93ccc7c4bce..7ec30498b9d35 100644
--- a/clang/test/CodeGen/ptrauth-function-attributes.c
+++ b/clang/test/CodeGen/ptrauth-function-attributes.c
@@ -4,15 +4,10 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios  -fptrauth-calls   -emit-llvm %s  -o - | FileCheck %s --check-prefixes=ALL,CALLS
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls  -emit-llvm %s  -o - | FileCheck %s --check-prefixes=ALL,CALLS
 
-// RUN: %clang_cc1 -triple arm64-apple-ios  -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
-// RUN: %clang_cc1 -triple arm64e-apple-ios -fptrauth-indirect-gotos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ALL,GOTOS
-
 // ALL: define {{(dso_local )?}}void @test() #0
 void test() {
 }
 
 // CALLS: attributes #0 = {{{.*}} "ptrauth-calls" {{.*}}}
 
-// GOTOS: attributes #0 = {{{.*}} "ptrauth-indirect-gotos" {{.*}}}
-
 // OFF-NOT: attributes {{.*}} "ptrauth-
diff --git a/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp b/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp
deleted file mode 100644
index f0c3ea83d8958..0000000000000
--- a/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fno-rtti -fptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-address-discrimination -emit-llvm -o - %s | FileCheck %s
-
-// CHECK: %struct.Base1 = type { ptr }
-// CHECK: %struct.Base2 = type { ptr }
-// CHECK: %struct.Derived1 = type { %struct.Base1, %struct.Base2 }
-// CHECK: %struct.Derived2 = type { %struct.Base2, %struct.Base1 }
-// CHECK: %struct.Derived3 = type { %struct.Base1, %struct.Base2 }
-
-// CHECK: @_ZTV5Base1 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC:38871]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2))] }, align 8
-// CHECK: @g_b1 = global %struct.Base1 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC:6511]], ptr @g_b1) }, align 8
-// CHECK: @_ZTV5Base2 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC:27651]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2))] }, align 8
-// CHECK: @g_b2 = global %struct.Base2 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC:63631]], ptr @g_b2) }, align 8
-// CHECK: @_ZTV8Derived1 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived11cEv, i32 0, i64 [[DERIVED1_C_DISC:54092]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived11dEv, i32 0, i64 [[DERIVED1_D_DISC:37391]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2))] }, align 8
-// CHECK: @g_d1 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d1), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d1, i32 0, i32 1)) }, align 8
-// CHECK: @_ZTV8Derived2 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived21cEv, i32 0, i64 [[DERIVED2_C_DISC:15537]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived21eEv, i32 0, i64 209, ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2))] }, align 8
-// CHECK: @g_d2 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr @g_d2), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d2, i32 0, i32 1)) }, align 8
-// CHECK: @_ZTV8Derived3 = linkonce_odr unnamed_addr constant { [4 x ptr], [3 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived31iEv, i32 0, i64 [[DERIVED3_I_DISC:19084]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 3))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2))] }, align 8
-// CHECK: @g_d3 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d3), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d3, i32 0, i32 1)) }, align 8
-// CHECK: @g_vb1 = global %struct.VirtualBase1 zeroinitializer, align 8
-// CHECK: @g_vb2 = global %struct.VirtualBase2 zeroinitializer, align 8
-// CHECK: @g_d4 = global %struct.Derived4 zeroinitializer, align 8
-// CHECK: @_ZTV12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC:7987]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 5))] }, align 8
-// CHECK: @_ZTT12VirtualBase1 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2)], align 8
-// CHECK: @_ZTV12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC:51224]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3))] }, align 8
-// CHECK: @_ZTT12VirtualBase2 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3), i32 2)], align 8
-// CHECK: @_ZTV8Derived4 = linkonce_odr unnamed_addr constant { [7 x ptr], [5 x ptr] } { [7 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 5)), ptr ptrauth (ptr @_ZN8Derived41hEv, i32 0, i64 [[DERIVED4_H_DISC:31844]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 6))], [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 4))] }, align 8
-// CHECK: @_ZTT8Derived4 = linkonce_odr unnamed_addr constant [7 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3), i32 2)], align 8
-// CHECK: @_ZTC8Derived40_12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 5))] }, align 8
-// CHECK: @_ZTC8Derived48_12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3))] }, align 8
-
-struct Base1 { virtual void a() {} };
-struct Base2 { virtual void b() {} };
-struct Derived1 : public Base1, public Base2 {
-  virtual void c() {}
-  virtual void d() {}
-};
-struct Derived2 : public Base2, public Base1 {
-  virtual void c() {}
-  virtual void e() {}
-};
-
-struct Derived3 : public Base1, public Base2 {
-  constexpr Derived3(){}
-  virtual void i() {}
-};
-
-Base1 g_b1;
-Base2 g_b2;
-Derived1 g_d1;
-Derived2 g_d2;
-Derived3 g_d3;
-
-extern "C" void test_basic_inheritance() {
-  Base1 g_b1;
-  Base2 g_b2;
-  Derived1 g_d1;
-  Derived2 g_d2;
-  Derived3 g_d3;
-}
-
-struct VirtualBase1 : virtual Base1 {
-  VirtualBase1(){}
-  virtual void f() {}
-};
-struct VirtualBase2 : virtual Base1, Base2 {
-  VirtualBase2(){}
-  virtual void g() {}
-};
-struct Derived4 : VirtualBase1, VirtualBase2 {
-  virtual void h() {}
-};
-struct Derived5 : VirtualBase2, VirtualBase1 {
-  virtual void h() {}
-};
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN12VirtualBase1C1Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN12VirtualBase2C1Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived4C1Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived5C1Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-
-
-VirtualBase1 g_vb1;
-VirtualBase2 g_vb2;
-Derived4 g_d4;
-Derived5 g_d5;
-
-
-extern "C" void cross_check_vtables(Base1 *b1,
-                   Base2 *b2,
-                   Derived1 *d1,
-                   Derived2 *d2,
-                   Derived3 *d3,
-                   VirtualBase1 *vb1,
-                   VirtualBase2 *vb2,
-                   Derived4 *d4,
-                   Derived4 *d5) {
-  asm("; b1->a()" ::: "memory");
-  b1->a();
-  asm("; b2->b()" ::: "memory");
-  b2->b();
-  asm("; d1->a()" ::: "memory");
-  d1->a();
-  asm("; d1->c()" ::: "memory");
-  d1->c();
-  asm("; d2->a()" ::: "memory");
-  d2->a();
-  asm("; d2->c()" ::: "memory");
-  d2->c();
-  asm("; d3->a()" ::: "memory");
-  d3->a();
-  asm("; d3->b()" ::: "memory");
-  d3->b();
-  asm("; d3->i()" ::: "memory");
-  d3->i();
-  asm("; vb1->a()" ::: "memory");
-  vb1->a();
-  asm("; vb1->f()" ::: "memory");
-  vb1->f();
-  asm("; vb2->a()" ::: "memory");
-  vb2->a();
-  asm("; vb2->g()" ::: "memory");
-  vb2->g();
-  asm("; d4->a()" ::: "memory");
-  d4->a();
-  asm("; d4->b()" ::: "memory");
-  d4->b();
-  asm("; d4->f()" ::: "memory");
-  d4->f();
-  asm("; d4->g()" ::: "memory");
-  d4->g();
-  asm("; d4->h()" ::: "memory");
-  d4->h();
-  asm("; d5->a()" ::: "memory");
-  d5->a();
-  asm("; d5->b()" ::: "memory");
-  d5->b();
-  asm("; d5->f()" ::: "memory");
-  d5->f();
-  asm("; d5->g()" ::: "memory");
-  d5->g();
-  asm("; d5->h()" ::: "memory");
-  d5->h();
-}
-
-// CHECK-LABEL: define void @cross_check_vtables(
-// CHECK: "; b1->a()",
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
-// CHECK: "; b2->b()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]])
-// CHECK: "; d1->a()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
-// CHECK: "; d1->c()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED1_C_DISC]])
-// CHECK: "; d2->a()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
-// CHECK: "; d2->c()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED2_C_DISC]])
-// CHECK: "; d3->a()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
-// CHECK: "; d3->b()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]])
-// CHECK: "; d3->i()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED3_I_DISC]])
-// CHECK: "; vb1->a()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
-// CHECK: "; vb1->f()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE1_F_DISC]])
-// CHECK: "; vb2->a()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
-// CHECK: "; vb2->g()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE2_G_DISC]])
-// CHECK: "; d4->a()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]])
-// CHECK: "; d4->b()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]])
-// CHECK: "; d4->f()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE1_F_DISC]])
-// CHECK: "; d4->g()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE2_G_DISC]])
-// CHECK: "; d4->h()"
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED4_H_DISC]])
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN5Base1C2Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN5Base2C2Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived1C2Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived2C2Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-
-// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived3C2Ev
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]])
-// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]])
-
diff --git a/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp b/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
deleted file mode 100644
index 5e84e3e7bc5e9..0000000000000
--- a/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
+++ /dev/null
@@ -1,433 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG %s
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -debug-info-kind=limited -o - %s | FileCheck -check-prefixes=CHECK %s
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 1 -o - %s | FileCheck %s -check-prefix=STACK-PROT
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 2 -o - %s | FileCheck %s -check-prefix=STACK-PROT
-// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 3 -o - %s | FileCheck %s -check-prefix=STACK-PROT
-
-
-// CHECK: @gmethod0 = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC1:35591]]) to i64), i64 0 }, align 8
-// CHECK: @gmethod1 = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived011nonvirtual5Ev, i32 0, i64 [[TYPEDISC0:22163]]) to i64), i64 0 }, align 8
-// CHECK: @gmethod2 = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, align 8
-
-// CHECK: @__const._Z13testArrayInitv.p0 = private unnamed_addr constant [1 x { i64, i64 }] [{ i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 35591) to i64), i64 0 }], align 8
-// CHECK: @__const._Z13testArrayInitv.p1 = private unnamed_addr constant [1 x { i64, i64 }] [{ i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 35591) to i64), i64 0 }], align 8
-// CHECK: @__const._Z13testArrayInitv.c0 = private unnamed_addr constant %struct.Class0 { { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 35591) to i64), i64 0 } }, align 8
-// CHECK: @__const._Z13testArrayInitv.c1 = private unnamed_addr constant %struct.Class0 { { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 35591) to i64), i64 0 } }, align 8
-
-// CHECK: @_ZTV5Base0 = unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr @_ZTI5Base0,
-// CHECK-SAME: ptr ptrauth (ptr @_ZN5Base08virtual1Ev, i32 0, i64 55600, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5Base0, i32 0, i32 0, i32 2)),
-// CHECK-SAME: ptr ptrauth (ptr @_ZN5Base08virtual3Ev, i32 0, i64 53007, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5Base0, i32 0, i32 0, i32 3)),
-// CHECK-SAME: ptr ptrauth (ptr @_ZN5Base016virtual_variadicEiz, i32 0, i64 7464, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5Base0, i32 0, i32 0, i32 4))] }, align 8
-
-typedef __SIZE_TYPE__ size_t;
-
-namespace std {
-template <typename _Ep>
-class initializer_list {
-  const _Ep *__begin_;
-  size_t __size_;
-
-  initializer_list(const _Ep *__b, size_t __s);
-};
-} // namespace std
-
-struct Base0 {
-  void nonvirtual0();
-  virtual void virtual1();
-  virtual void virtual3();
-  virtual void virtual_variadic(int, ...);
-};
-
-struct A0 {
-  int d[4];
-};
-
-struct A1 {
-  int d[8];
-};
-
-struct __attribute__((trivial_abi)) TrivialS {
-  TrivialS(const TrivialS &);
-  ~TrivialS();
-  int p[4];
-};
-
-struct Derived0 : Base0 {
-  void virtual1() override;
-  void nonvirtual5();
-  virtual void virtual6();
-  virtual A0 return_agg();
-  virtual A1 sret();
-  virtual void trivial_abi(TrivialS);
-};
-
-struct Base1 {
-  virtual void virtual7();
-};
-
-struct Derived1 : Base0, Base1 {
-  void virtual1() override;
-  void virtual7() override;
-};
-
-typedef void (Base0::*MethodTy0)();
-typedef void (Base0::*VariadicMethodTy0)(int, ...);
-typedef void (Derived0::*MethodTy1)();
-
-struct Class0 {
-  MethodTy1 m0;
-};
-
-// CHECK: define void @_ZN5Base08virtual1Ev(
-
-// CHECK: define void @_Z5test0v()
-// CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
-// CHECK-NEXT: %[[VARMETHOD1:.*]] = alloca { i64, i64 }, align 8
-// CHECK-NEXT: %[[METHOD2:.*]] = alloca { i64, i64 }, align 8
-// CHECK-NEXT: %[[METHOD3:.*]] = alloca { i64, i64 }, align 8
-// CHECK-NEXT: %[[METHOD4:.*]] = alloca { i64, i64 }, align 8
-// CHECK-NEXT: %[[METHOD5:.*]] = alloca { i64, i64 }, align 8
-// CHECK-NEXT: %[[METHOD6:.*]] = alloca { i64, i64 }, align 8
-// CHECK-NEXT: %[[METHOD7:.*]] = alloca { i64, i64 }, align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
-// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
-// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual3Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base016virtual_variadicEiz_vfpthunk_, i32 0, i64 34368) to i64), i64 0 }, ptr %[[VARMETHOD1]], align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
-// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
-// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual3Ev_vfpthunk_, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
-// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived011nonvirtual5Ev, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
-// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived08virtual6Ev_vfpthunk_, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %[[METHOD2]], align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived010return_aggEv_vfpthunk_, i32 0, i64 64418) to i64), i64 0 }, ptr %[[METHOD3]], align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived04sretEv_vfpthunk_, i32 0, i64 28187) to i64), i64 0 }, ptr %[[METHOD4]], align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived011trivial_abiE8TrivialS_vfpthunk_, i32 0, i64 8992) to i64), i64 0 }, ptr %[[METHOD5]], align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base18virtual7Ev_vfpthunk_, i32 0, i64 [[TYPEDISC2:61596]]) to i64), i64 0 }, ptr %[[METHOD6]], align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN8Derived18virtual7Ev_vfpthunk_, i32 0, i64 25206) to i64), i64 0 }, ptr %[[METHOD7]], align 8
-// CHECK-NEXT: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 25206) to i64), i64 0 }, ptr %[[METHOD7]], align 8
-// CHECK: ret void
-
-// CHECK: define linkonce_odr hidden void @_ZN5Base08virtual1Ev_vfpthunk_(ptr noundef %[[THIS:.*]])
-// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
-// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK-NEXT: %[[V0:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
-// CHECK-NEXT: %[[V2:.*]] = ptrtoint ptr %[[VTABLE]] to i64
-// CHECK-NEXT: %[[V3:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V2]], i32 2, i64 0)
-// CHECK-NEXT: %[[V4:.*]] = inttoptr i64 %[[V3]] to ptr
-// CHECK-NEXT: %[[VFN:.*]] = getelementptr inbounds ptr, ptr %[[V4]], i64 0
-// CHECK-NEXT: %[[V5:.*]] = load ptr, ptr %[[VFN]], align 8
-// CHECK-NEXT: %[[V6:.*]] = ptrtoint ptr %[[VFN]] to i64
-// CHECK-NEXT: %[[V7:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V6]], i64 55600)
-// CHECK-NEXT: musttail call void %[[V5]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %[[V0]]) [ "ptrauth"(i32 0, i64 %[[V7]]) ]
-// CHECK-NEXT: ret void
-
-// CHECK: define linkonce_odr hidden void @_ZN5Base08virtual3Ev_vfpthunk_(ptr noundef %{{.*}})
-// CHECK: load ptr, ptr %{{.*}}, align 8
-// CHECK: load ptr, ptr %{{.*}}, align 8
-// CHECK: %[[VTABLE:.*]] = load ptr, ptr %{{.*}}, align 8
-// CHECK: %[[V2:.*]] = ptrtoint ptr %[[VTABLE]] to i64
-// CHECK: %[[V3:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V2]], i32 2, i64 0)
-// CHECK: %[[V4:.*]] = inttoptr i64 %[[V3]] to ptr
-// CHECK: getelementptr inbounds ptr, ptr %[[V4]], i64 1
-// CHECK: call i64 @llvm.ptrauth.blend(i64 %{{.*}}, i64 53007)
-
-// CHECK: define linkonce_odr hidden void @_ZN5Base016virtual_variadicEiz_vfpthunk_(ptr noundef %[[THIS:.*]], i32 noundef %0, ...)
-// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
-// CHECK-NEXT: %[[_ADDR:.*]] = alloca i32, align 4
-// CHECK-NEXT: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
-// CHECK: store i32 %0, ptr %[[_ADDR]], align 4
-// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK-NEXT: %[[V1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK-NEXT: %[[V2:.*]] = load i32, ptr %[[_ADDR]], align 4
-// CHECK-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
-// CHECK-NEXT: %[[V4:.*]] = ptrtoint ptr %[[VTABLE]] to i64
-// CHECK-NEXT: %[[V5:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V4]], i32 2, i64 0)
-// CHECK-NEXT: %[[V6:.*]] = inttoptr i64 %[[V5]] to ptr
-// CHECK-NEXT: %[[VFN:.*]] = getelementptr inbounds ptr, ptr %[[V6]], i64 2
-// CHECK-NEXT: %[[V7:.*]] = load ptr, ptr %[[VFN]], align 8
-// CHECK-NEXT: %[[V8:.*]] = ptrtoint ptr %[[VFN]] to i64
-// CHECK-NEXT: %[[V9:.*]] = call i64 @llvm.ptrauth.blend(i64 %[[V8]], i64 7464)
-// CHECK-NEXT: musttail call void (ptr, i32, ...) %[[V7]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %[[V1]], i32 noundef %[[V2]], ...) [ "ptrauth"(i32 0, i64 %[[V9]]) ]
-// CHECK-NEXT: ret void
-
-// CHECK: define linkonce_odr hidden void @_ZN8Derived08virtual6Ev_vfpthunk_(ptr noundef %[[THIS:.*]])
-// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
-// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[V0:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
-// CHECK: %[[V1:.*]] = ptrtoint ptr %[[VTABLE]] to i64
-// CHECK: %[[V2:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V1]], i32 2, i64 0)
-// CHECK: %[[V3:.*]] = inttoptr i64 %[[V2]] to ptr
-// CHECK: %[[VFN:.*]] = getelementptr inbounds ptr, ptr %[[V3]], i64 3
-// CHECK: %[[V5:.*]] = ptrtoint ptr %[[VFN]] to i64
-// CHECK: call i64 @llvm.ptrauth.blend(i64 %[[V5]], i64 55535)
-
-// Check that the return value of the musttail call isn't copied to a temporary.
-
-// CHECK: define linkonce_odr hidden [2 x i64] @_ZN8Derived010return_aggEv_vfpthunk_(ptr noundef %{{.*}})
-// CHECK: %[[CALL:.*]] = musttail call [2 x i64] %{{.*}}(ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %{{.*}}) [ "ptrauth"(i32 0, i64 %{{.*}}) ]
-// CHECK-NEXT: ret [2 x i64] %[[CALL]]
-
-// Check that the sret pointer passed to the caller is forwarded to the musttail
-// call.
-
-// CHECK: define linkonce_odr hidden void @_ZN8Derived04sretEv_vfpthunk_(ptr dead_on_unwind noalias writable sret(%struct.A1) align 4 %[[AGG_RESULT:.*]], ptr noundef %{{.*}})
-// CHECK: musttail call void %{{.*}}(ptr dead_on_unwind writable sret(%struct.A1) align 4 %[[AGG_RESULT]], ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %{{.*}}) [ "ptrauth"(i32 0, i64 %{{.*}}) ]
-// CHECK-NEXT: ret void
-
-// Check that the thunk function doesn't destruct the trivial_abi argument.
-
-// CHECK: define linkonce_odr hidden void @_ZN8Derived011trivial_abiE8TrivialS_vfpthunk_(ptr noundef %{{.*}}, [2 x i64] %{{.*}})
-// NODEBUG-NOT: call
-// CHECK: call i64 @llvm.ptrauth.auth(
-// NODEBUG-NOT: call
-// CHECK: call i64 @llvm.ptrauth.blend(
-// NODEBUG-NOT: call
-// CHECK: musttail call void
-// CHECK-NEXT: ret void
-
-// CHECK: define linkonce_odr hidden void @_ZN5Base18virtual7Ev_vfpthunk_(ptr noundef %[[THIS:.*]])
-// CHECK: entry:
-// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
-// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[V0:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
-// CHECK: %[[V1:.*]] = ptrtoint ptr %[[VTABLE]] to i64
-// CHECK: %[[V2:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V1]], i32 2, i64 0)
-// CHECK: %[[V3:.*]] = inttoptr i64 %[[V2]] to ptr
-// CHECK: getelementptr inbounds ptr, ptr %[[V3]], i64 0
-
-// CHECK: define linkonce_odr hidden void @_ZN8Derived18virtual7Ev_vfpthunk_(ptr noundef %[[THIS:.*]])
-// CHECK: %[[THIS_ADDR:.*]] = alloca ptr, align 8
-// CHECK: store ptr %[[THIS]], ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[THIS1:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK: load ptr, ptr %[[THIS_ADDR]], align 8
-// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[THIS1]], align 8
-// CHECK: %[[V1:.*]] = ptrtoint ptr %[[VTABLE]] to i64
-// CHECK: %[[V2:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V1]], i32 2, i64 0)
-// CHECK: %[[V3:.*]] = inttoptr i64 %[[V2]] to ptr
-// CHECK: getelementptr inbounds ptr, ptr %[[V3]], i64 3
-
-void Base0::virtual1() {}
-
-void test0() {
-  MethodTy0 method0;
-  method0 = &Base0::nonvirtual0;
-  method0 = &Base0::virtual1;
-  method0 = &Base0::virtual3;
-
-  VariadicMethodTy0 varmethod1;
-  varmethod1 = &Base0::virtual_variadic;
-
-  MethodTy1 method2;
-  method2 = &Derived0::nonvirtual0;
-  method2 = &Derived0::virtual1;
-  method2 = &Derived0::virtual3;
-  method2 = &Derived0::nonvirtual5;
-  method2 = &Derived0::virtual6;
-
-  A0 (Derived0::*method3)();
-  method3 = &Derived0::return_agg;
-
-  A1 (Derived0::*method4)();
-  method4 = &Derived0::sret;
-
-  void (Derived0::*method5)(TrivialS);
-  method5 = &Derived0::trivial_abi;
-
-  void (Base1::*method6)();
-  method6 = &Base1::virtual7;
-
-  void (Derived1::*method7)();
-  method7 = &Derived1::virtual7;
-  method7 = &Derived1::virtual1;
-}
-
-// CHECK: define void @_Z5test1P5Base0MS_FvvE(ptr noundef %[[A0:.*]], [2 x i64] %[[A1_COERCE:.*]])
-// CHECK: %[[A1:.*]] = alloca { i64, i64 }, align 8
-// CHECK: %[[A0_ADDR:.*]] = alloca ptr, align 8
-// CHECK: %[[A1_ADDR:.*]] = alloca { i64, i64 }, align 8
-// CHECK: store [2 x i64] %[[A1_COERCE]], ptr %[[A1]], align 8
-// CHECK: %[[A11:.*]] = load { i64, i64 }, ptr %[[A1]], align 8
-// CHECK: store ptr %[[A0]], ptr %[[A0_ADDR]], align 8
-// CHECK: store { i64, i64 } %[[A11]], ptr %[[A1_ADDR]], align 8
-// CHECK: %[[V1:.*]] = load ptr, ptr %[[A0_ADDR]], align 8
-// CHECK: %[[V2:.*]] = load { i64, i64 }, ptr %[[A1_ADDR]], align 8
-// CHECK: %[[MEMPTR_ADJ:.*]] = extractvalue { i64, i64 } %[[V2]], 1
-// CHECK: %[[MEMPTR_ADJ_SHIFTED:.*]] = ashr i64 %[[MEMPTR_ADJ]], 1
-// CHECK: %[[V4:.*]] = getelementptr inbounds i8, ptr %[[V1]], i64 %[[MEMPTR_ADJ_SHIFTED]]
-// CHECK: %[[MEMPTR_PTR:.*]] = extractvalue { i64, i64 } %[[V2]], 0
-// CHECK: %[[V5:.*]] = and i64 %[[MEMPTR_ADJ]], 1
-// CHECK: %[[MEMPTR_ISVIRTUAL:.*]] = icmp ne i64 %[[V5]], 0
-// CHECK: br i1 %[[MEMPTR_ISVIRTUAL]]
-
-// CHECK: %[[VTABLE:.*]] = load ptr, ptr %[[V4]], align 8
-// CHECK: %[[V7:.*]] = ptrtoint ptr %[[VTABLE]] to i64
-// CHECK: %[[V8:.*]] = call i64 @llvm.ptrauth.auth(i64 %[[V7]], i32 2, i64 0)
-// CHECK: %[[V9:.*]] = inttoptr i64 %[[V8]] to ptr
-// CHECK: %[[V10:.*]] = trunc i64 %[[MEMPTR_PTR]] to i32
-// CHECK: %[[V11:.*]] = zext i32 %[[V10]] to i64
-// CHECK: %[[V12:.*]] = getelementptr i8, ptr %[[V9]], i64 %[[V11]]
-// CHECK: %[[MEMPTR_VIRTUALFN:.*]] = load ptr, ptr %[[V12]], align 8
-// CHECK: br
-
-// CHECK: %[[MEMPTR_NONVIRTUALFN:.*]] = inttoptr i64 %[[MEMPTR_PTR]] to ptr
-// CHECK: br
-
-// CHECK: %[[V14:.*]] = phi ptr [ %[[MEMPTR_VIRTUALFN]], {{.*}} ], [ %[[MEMPTR_NONVIRTUALFN]], {{.*}} ]
-// CHECK: %[[V15:.*]] = phi i64 [ 0, {{.*}} ], [ [[TYPEDISC0]], {{.*}} ]
-// CHECK: call void %[[V14]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %[[V4]]) [ "ptrauth"(i32 0, i64 %[[V15]]) ]
-// CHECK: ret void
-
-void test1(Base0 *a0, MethodTy0 a1) {
-  (a0->*a1)();
-}
-
-// CHECK: define void @_Z15testConversion0M5Base0FvvEM8Derived0FvvE([2 x i64] %[[METHOD0_COERCE:.*]], [2 x i64] %[[METHOD1_COERCE:.*]])
-// CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
-// CHECK: %[[METHOD1:.*]] = alloca { i64, i64 }, align 8
-// CHECK: %[[METHOD0_ADDR:.*]] = alloca { i64, i64 }, align 8
-// CHECK: %[[METHOD1_ADDR:.*]] = alloca { i64, i64 }, align 8
-// CHECK: store [2 x i64] %[[METHOD0_COERCE]], ptr %[[METHOD0]], align 8
-// CHECK: %[[METHOD01:.*]] = load { i64, i64 }, ptr %[[METHOD0]], align 8
-// CHECK: store [2 x i64] %[[METHOD1_COERCE]], ptr %[[METHOD1]], align 8
-// CHECK: %[[METHOD12:.*]] = load { i64, i64 }, ptr %[[METHOD1]], align 8
-// CHECK: store { i64, i64 } %[[METHOD01]], ptr %[[METHOD0_ADDR]], align 8
-// CHECK: store { i64, i64 } %[[METHOD12]], ptr %[[METHOD1_ADDR]], align 8
-// CHECK: %[[V2:.*]] = load { i64, i64 }, ptr %[[METHOD0_ADDR]], align 8
-// CHECK: %[[MEMPTR_PTR:.*]] = extractvalue { i64, i64 } %[[V2]], 0
-// CHECK: %[[MEMPTR_ADJ:.*]] = extractvalue { i64, i64 } %[[V2]], 1
-// CHECK: %[[V3:.*]] = and i64 %[[MEMPTR_ADJ]], 1
-// CHECK: %[[IS_VIRTUAL_OFFSET:.*]] = icmp ne i64 %[[V3]], 0
-// CHECK: br i1 %[[IS_VIRTUAL_OFFSET]]
-
-// CHECK: %[[V4:.*]] = inttoptr i64 %[[MEMPTR_PTR]] to ptr
-// CHECK: %[[V5:.*]] = icmp ne ptr %[[V4]], null
-// CHECK: br i1 %[[V5]]
-
-// CHECK: %[[V6:.*]] = ptrtoint ptr %[[V4]] to i64
-// CHECK: %[[V7:.*]] = call i64 @llvm.ptrauth.resign(i64 %[[V6]], i32 0, i64 [[TYPEDISC0]], i32 0, i64 [[TYPEDISC1]])
-// CHECK: %[[V8:.*]] = inttoptr i64 %[[V7]] to ptr
-// CHECK: br
-
-// CHECK: %[[V9:.*]] = phi ptr [ null, {{.*}} ], [ %[[V8]], {{.*}} ]
-// CHECK: %[[V1:.*]] = ptrtoint ptr %[[V9]] to i64
-// CHECK: %[[V11:.*]] = insertvalue { i64, i64 } %[[V2]], i64 %[[V1]], 0
-// CHECK: br
-
-// CHECK: %[[V12:.*]] = phi { i64, i64 } [ %[[V2]], {{.*}} ], [ %[[V11]], {{.*}} ]
-// CHECK: store { i64, i64 } %[[V12]], ptr %[[METHOD1_ADDR]], align 8
-// CHECK: ret void
-
-void testConversion0(MethodTy0 method0, MethodTy1 method1) {
-  method1 = method0;
-}
-
-// CHECK: define void @_Z15testConversion1M5Base0FvvE(
-// CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC0]], i32 0, i64 [[TYPEDISC1]])
-
-void testConversion1(MethodTy0 method0) {
-  MethodTy1 method1 = reinterpret_cast<MethodTy1>(method0);
-}
-
-// CHECK: define void @_Z15testConversion2M8Derived0FvvE(
-// CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC1]], i32 0, i64 [[TYPEDISC0]])
-
-void testConversion2(MethodTy1 method1) {
-  MethodTy0 method0 = static_cast<MethodTy0>(method1);
-}
-
-// CHECK: define void @_Z15testConversion3M8Derived0FvvE(
-// CHECK: call i64 @llvm.ptrauth.resign(i64 %{{.*}}, i32 0, i64 [[TYPEDISC1]], i32 0, i64 [[TYPEDISC0]])
-
-void testConversion3(MethodTy1 method1) {
-  MethodTy0 method0 = reinterpret_cast<MethodTy0>(method1);
-}
-
-// No need to call @llvm.ptrauth.resign if the source member function
-// pointer is a constant.
-
-// CHECK: define void @_Z15testConversion4v(
-// CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC0]]) to i64), i64 0 }, ptr %[[METHOD0]], align 8
-// CHECK: ret void
-
-void testConversion4() {
-  MethodTy0 method0 = reinterpret_cast<MethodTy0>(&Derived0::virtual1);
-}
-
-// This code used to crash.
-namespace testNonVirtualThunk {
-  struct R {};
-
-  struct B0 {
-    virtual void bar();
-  };
-
-  struct B1 {
-    virtual R foo();
-  };
-
-  struct D : B0, B1 {
-    virtual R foo();
-  };
-
-  D d;
-}
-
-// CHECK: define internal void @_ZN22TestAnonymousNamespace12_GLOBAL__N_11S3fooEv_vfpthunk_(
-
-namespace TestAnonymousNamespace {
-namespace {
-struct S {
-  virtual void foo(){};
-};
-} // namespace
-
-void test() {
-  auto t = &S::foo;
-}
-} // namespace TestAnonymousNamespace
-
-MethodTy1 gmethod0 = reinterpret_cast<MethodTy1>(&Base0::nonvirtual0);
-MethodTy0 gmethod1 = reinterpret_cast<MethodTy0>(&Derived0::nonvirtual5);
-MethodTy0 gmethod2 = reinterpret_cast<MethodTy0>(&Derived0::virtual1);
-
-// CHECK-LABEL: define void @_Z13testArrayInitv()
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %p0, ptr align 8 @__const._Z13testArrayInitv.p0, i64 16, i1 false)
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %p1, ptr align 8 @__const._Z13testArrayInitv.p1, i64 16, i1 false)
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %c0, ptr align 8 @__const._Z13testArrayInitv.c0, i64 16, i1 false)
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %c1, ptr align 8 @__const._Z13testArrayInitv.c1, i64 16, i1 false)
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %{{.*}} align 8
-// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 [[TYPEDISC1]]) to i64), i64 0 }, ptr %{{.*}}, align 8
-
-void initList(std::initializer_list<MethodTy1>);
-
-void testArrayInit() {
-  MethodTy1 p0[] = {&Base0::nonvirtual0};
-  MethodTy1 p1[] = {&Base0::virtual1};
-  Class0 c0{&Base0::nonvirtual0};
-  Class0 c1{&Base0::virtual1};
-  initList({&Base0::nonvirtual0});
-  initList({&Base0::virtual1});
-}
-
-
-
-// STACK-PROT: define {{.*}}_vfpthunk{{.*}}[[ATTRS:#[0-9]+]]
-// STACK-PROT: attributes [[ATTRS]] =
-// STACK-PROT-NOT: ssp
-// STACK-PROT-NOT: sspstrong
-// STACK-PROT-NOT: sspreq
-// STACK-PROT-NEXT: attributes
-
-// CHECK: define void @_Z15testConvertNullv(
-// CHECK: %[[T:.*]] = alloca { i64, i64 },
-// CHECK: store { i64, i64 } zeroinitializer, ptr %[[T]],
-
-void testConvertNull() {
-  VariadicMethodTy0 t = (VariadicMethodTy0)(MethodTy0{});
-}
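For reference, the signing scheme the deleted checks pin down reduces to a short sketch (a hedged reproduction, not part of the patch: it assumes an arm64-style toolchain with -fptrauth-calls/-fptrauth-intrinsics, reuses the _vfpthunk_ naming from the checks above, and leaves the concrete discriminator constants unchecked):

// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
// RUN:   -emit-llvm -std=c++11 -o - %s | FileCheck %s
struct Base {
  void nonvirtual();   // address is signed directly, with the member-pointer type discriminator
  virtual void virt(); // address points at a signed _vfpthunk_ that authenticates
                       // the vtable load and musttail-calls through it
};
typedef void (Base::*MethodTy)();
// CHECK: ptrauth (ptr @_ZN4Base10nonvirtualEv, i32 0,
// CHECK: ptrauth (ptr @_ZN4Base4virtEv_vfpthunk_, i32 0,
MethodTy m0 = &Base::nonvirtual;
MethodTy m1 = &Base::virt;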
diff --git a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp
deleted file mode 100644
index d5f69e0485140..0000000000000
--- a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-// RUN: %clang_cc1 -DENABLE_TID=0 -I%S -std=c++11 -triple=arm64e-apple-darwin \
-// RUN:   -fptrauth-calls -fptrauth-intrinsics \
-// RUN:   -fptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fptrauth-vtable-pointer-address-discrimination \
-// RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NODISC
-
-// RUN: %clang_cc1 -DENABLE_TID=1 -I%S -std=c++11 -triple=arm64e-apple-darwin \
-// RUN:   -fptrauth-calls -fptrauth-intrinsics \
-// RUN:   -fptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fptrauth-vtable-pointer-address-discrimination \
-// RUN:   -fptrauth-type-info-vtable-pointer-discrimination \
-// RUN:   %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,DISC
-
-// copied from typeinfo
-namespace std {
-
-#if __has_cpp_attribute(clang::ptrauth_vtable_pointer)
-#  if __has_feature(ptrauth_type_info_vtable_pointer_discrimination)
-#    define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH \
-       [[clang::ptrauth_vtable_pointer(process_independent, address_discrimination, type_discrimination)]]
-#  else
-#    define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH \
-       [[clang::ptrauth_vtable_pointer(process_independent, no_address_discrimination, no_extra_discrimination)]]
-#  endif
-#else
-#  define _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH
-#endif
-
-  class _LIBCPP_TYPE_INFO_VTABLE_POINTER_AUTH type_info
-  {
-    type_info& operator=(const type_info&);
-    type_info(const type_info&);
-
-  protected:
-      explicit type_info(const char* __n);
-
-  public:
-      virtual ~type_info();
-
-      virtual void test_method();
-  };
-} // namespace std
-
-static_assert(__has_feature(ptrauth_type_info_vtable_pointer_discrimination) == ENABLE_TID, "incorrect feature state");
-
-// CHECK: @disc_std_type_info = global i32 [[STDTYPEINFO_DISC:45546]]
-extern "C" int disc_std_type_info = __builtin_ptrauth_string_discriminator("_ZTVSt9type_info");
-
-// CHECK: @_ZTV10TestStruct = unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI10TestStruct, ptr ptrauth (ptr @_ZN10TestStructD1Ev, i32 0, i64 52216, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV10TestStruct, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN10TestStructD0Ev, i32 0, i64 39671, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV10TestStruct, i32 0, i32 0, i32 3))] }, align 8
-// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
-// CHECK: @_ZTS10TestStruct = constant [13 x i8] c"10TestStruct\00", align 1
-
-// NODISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS10TestStruct }, align 8
-
-// DISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 [[STDTYPEINFO_DISC]]), ptr @_ZTS10TestStruct }, align 8
-
-struct TestStruct {
-  virtual ~TestStruct();
-  int a;
-};
-
-TestStruct::~TestStruct(){}
-
-extern "C" void test_vtable(std::type_info* t) {
-  t->test_method();
-}
-// NODISC: define void @test_vtable(ptr noundef %t)
-// NODISC: [[T_ADDR:%.*]] = alloca ptr, align 8
-// NODISC: store ptr %t, ptr [[T_ADDR]], align 8
-// NODISC: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8
-// NODISC: [[VPTR:%.*]] = load ptr, ptr [[T]], align 8
-// NODISC: [[CAST_VPTR:%.*]] = ptrtoint ptr [[VPTR]] to i64
-// NODISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_VPTR]], i32 2, i64 0)
-
-// DISC: define void @test_vtable(ptr noundef %t)
-// DISC: [[T_ADDR:%.*]] = alloca ptr, align 8
-// DISC: store ptr %t, ptr [[T_ADDR]], align 8
-// DISC: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8
-// DISC: [[VPTR:%.*]] = load ptr, ptr [[T]], align 8
-// DISC: [[ADDR:%.*]] = ptrtoint ptr [[T]] to i64
-// DISC: [[DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[ADDR]], i64 [[STDTYPEINFO_DISC]])
-// DISC: [[VPTRI:%.*]] = ptrtoint ptr [[VPTR]] to i64
-// DISC: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VPTRI]], i32 2, i64 [[DISCRIMINATOR]])
-
-extern "C" const void *ensure_typeinfo() {
-  return new TestStruct;
-}
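The invariant the DISC checks rely on is compact enough to restate (a hedged sketch, reusing the builtin and mangled name from the deleted test; disc_sketch is an illustrative name):

// With -fptrauth-type-info-vtable-pointer-discrimination, a load of a
// std::type_info vtable pointer blends the object's address with this
// 16-bit string hash of "_ZTVSt9type_info" before the llvm.ptrauth.auth
// call; without the flag, the auth uses discriminator 0.
extern "C" int disc_sketch =
    __builtin_ptrauth_string_discriminator("_ZTVSt9type_info");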
diff --git a/clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-gnu/.keep b/clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-gnu/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-pauthtest/.keep b/clang/test/Driver/Inputs/multilib_aarch64_linux_tree/usr/include/aarch64-linux-pauthtest/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/aarch64-multilib-pauthabi.c b/clang/test/Driver/aarch64-multilib-pauthabi.c
deleted file mode 100644
index 3046cb856e83c..0000000000000
--- a/clang/test/Driver/aarch64-multilib-pauthabi.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: %clang --target=aarch64-linux-pauthtest       --sysroot=%S/Inputs/multilib_aarch64_linux_tree -### -c %s 2>&1 | FileCheck %s
-// RUN: %clang --target=aarch64-linux -mabi=pauthtest --sysroot=%S/Inputs/multilib_aarch64_linux_tree -### -c %s 2>&1 | FileCheck %s
-
-// CHECK: "-internal-externc-isystem" "{{.*}}/usr/include/aarch64-linux-pauthtest"
diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c
index d13930e8f4b37..fa0125f4b22a9 100644
--- a/clang/test/Driver/aarch64-ptrauth.c
+++ b/clang/test/Driver/aarch64-ptrauth.c
@@ -1,7 +1,5 @@
-// REQUIRES: aarch64-registered-target
-
 // RUN: %clang -### -c --target=aarch64 %s 2>&1 | FileCheck %s --check-prefix NONE
-// NONE:     "-cc1"
+// NONE: "-cc1"
 // NONE-NOT: "-fptrauth-
 
 // RUN: %clang -### -c --target=aarch64 \
@@ -15,78 +13,13 @@
 // RUN:   %s 2>&1 | FileCheck %s --check-prefix=ALL
 // ALL: "-cc1"{{.*}} "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-init-fini"
 
-// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
-// RUN: %clang -### -c --target=aarch64-linux-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1
-// PAUTHABI1:      "-cc1"{{.*}} "-triple" "aarch64-unknown-linux-pauthtest"
-// PAUTHABI1-SAME: "-target-abi" "pauthtest"
-// PAUTHABI1-SAME: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-init-fini"
-
-// RUN: %clang -### -c --target=aarch64 -mabi=pauthtest -fno-ptrauth-intrinsics \
-// RUN:   -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \
-// RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
-// RUN: %clang -### -c --target=aarch64-pauthtest -fno-ptrauth-intrinsics \
-// RUN:   -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \
-// RUN:   -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fno-ptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2
-// PAUTHABI2:     "-cc1"
-// PAUTHABI2-NOT: "-fptrauth-
-
 // RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination \
-// RUN:   -fptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=ERR1
-// ERR1:      error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}'
-// ERR1-NEXT: error: unsupported option '-fptrauth-calls' for target '{{.*}}'
-// ERR1-NEXT: error: unsupported option '-fptrauth-returns' for target '{{.*}}'
-// ERR1-NEXT: error: unsupported option '-fptrauth-auth-traps' for target '{{.*}}'
-// ERR1-NEXT: error: unsupported option '-fptrauth-vtable-pointer-address-discrimination' for target '{{.*}}'
-// ERR1-NEXT: error: unsupported option '-fptrauth-vtable-pointer-type-discrimination' for target '{{.*}}'
-// ERR1-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}'
-
-//// Only support the PAuth ABI for Linux for now.
-// RUN: not %clang -c --target=aarch64-unknown -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2
-// RUN: not %clang -c --target=aarch64-unknown-pauthtest       %s 2>&1 | FileCheck %s --check-prefix=ERR2
-// ERR2: error: ABI 'pauthtest' is not supported for 'aarch64-unknown-unknown-pauthtest'
-
-//// The PAuth ABI is encoded as the environment part of the triple, so don't allow explicitly setting other environments.
-// RUN: not %clang -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3
-// ERR3: error: unsupported option '-mabi=pauthtest' for target 'aarch64-unknown-linux-gnu'
-// RUN: %clang -c --target=aarch64-linux-pauthtest -mabi=pauthtest %s
-
-//// The only branch protection option compatible with PAuthABI is BTI.
-// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=pac-ret %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR4
-// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=pac-ret %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR4
-// ERR4: error: unsupported option '-mbranch-protection=pac-ret' for target 'aarch64-unknown-linux-pauthtest'
-
-// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=gcs %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR5
-// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=gcs %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR5
-// ERR5: error: unsupported option '-mbranch-protection=gcs' for target 'aarch64-unknown-linux-pauthtest'
-
-// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=standard %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR6
-// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=standard %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR6
-// ERR6: error: unsupported option '-mbranch-protection=standard' for target 'aarch64-unknown-linux-pauthtest'
-
-// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=all %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR7
-// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=all %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR7
-// ERR7: error: unsupported option '-msign-return-address=all' for target 'aarch64-unknown-linux-pauthtest'
-
-// RUN: not %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=non-leaf %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR8
-// RUN: not %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=non-leaf %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=ERR8
-// ERR8: error: unsupported option '-msign-return-address=non-leaf' for target 'aarch64-unknown-linux-pauthtest'
-
-// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -msign-return-address=none %s
-// RUN: %clang -### -c --target=aarch64-linux-pauthtest       -msign-return-address=none %s
-// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=bti %s
-// RUN: %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=bti %s
-// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -mbranch-protection=none %s
-// RUN: %clang -### -c --target=aarch64-linux-pauthtest       -mbranch-protection=none %s
+// RUN:   -fptrauth-init-fini %s 2>&1 | FileCheck %s --check-prefix=ERR
+// ERR:      error: unsupported option '-fptrauth-intrinsics' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-calls' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-returns' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-auth-traps' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-vtable-pointer-address-discrimination' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-vtable-pointer-type-discrimination' for target '{{.*}}'
+// ERR-NEXT: error: unsupported option '-fptrauth-init-fini' for target '{{.*}}'
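The net effect of the deleted PAUTHABI1/PAUTHABI2 blocks is worth keeping in one line: pauthtest rode in the environment slot of the triple. A condensed restatement in the same RUN/CHECK idiom (hedged; both spellings are taken from the deleted lines and describe the pre-revert driver):

// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest %s 2>&1 | FileCheck %s
// RUN: %clang -### -c --target=aarch64-linux-pauthtest       %s 2>&1 | FileCheck %s
// CHECK: "-cc1"{{.*}} "-triple" "aarch64-unknown-linux-pauthtest"{{.*}} "-target-abi" "pauthtest"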
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index 42d56cbfcc321..1dc4520f485db 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -32,8 +32,8 @@
 // RUN:   | FileCheck -check-prefix=ARGS %s
 
 //      ARGS: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_61" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
-// ARGS-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].o" "[[PTX]].s" "-c"
-// ARGS-NEXT: clang-nvlink-wrapper{{.*}}"-o" "a.out" "-arch" "sm_61"{{.*}}"[[CUBIN]].o"
+// ARGS-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
+// ARGS-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "[[CUBIN]].cubin"
 
 //
 // Test the generated arguments to the CUDA binary utils when targeting NVPTX.
@@ -55,7 +55,7 @@
 // RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -### %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=LINK %s
 
-// LINK: clang-nvlink-wrapper{{.*}}"-o" "a.out" "-arch" "sm_61"{{.*}}[[CUBIN:.+]].o
+// LINK: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.cubin"
 
 //
 // Test to ensure that we enable handling global constructors in a freestanding
@@ -72,7 +72,7 @@
 // RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -march=sm_52 -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=LINKER-ARGS %s
 
-// LINKER-ARGS: clang-nvlink-wrapper{{.*}}"-v"{{.*}}"a" "b"
+// LINKER-ARGS: nvlink{{.*}}"-v"{{.*}}"a" "b"
 
 // Tests for handling a missing architecture.
 //
diff --git a/clang/test/Driver/fpatchable-function-entry.c b/clang/test/Driver/fpatchable-function-entry.c
index 5f07ca99a69de..ab04fd39ffa1c 100644
--- a/clang/test/Driver/fpatchable-function-entry.c
+++ b/clang/test/Driver/fpatchable-function-entry.c
@@ -6,8 +6,6 @@
 // RUN: %clang --target=loongarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
 // RUN: %clang --target=riscv32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
 // RUN: %clang --target=riscv64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang --target=powerpc-unknown-linux-gnu %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang --target=powerpc64-unknown-linux-gnu %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
 // CHECK: "-fpatchable-function-entry=1"
 
 // RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=1,1 -c -### 2>&1 | FileCheck --check-prefix=11 %s
@@ -15,11 +13,8 @@
 // RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=2,1 -c -### 2>&1 | FileCheck --check-prefix=21 %s
 // 21: "-fpatchable-function-entry=2" "-fpatchable-function-entry-offset=1"
 
-// RUN: not %clang --target=powerpc64-ibm-aix-xcoff -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=AIX64 %s
-// AIX64: error: unsupported option '-fpatchable-function-entry=1' for target 'powerpc64-ibm-aix-xcoff'
-
-// RUN: not %clang --target=powerpc-ibm-aix-xcoff -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=AIX32 %s
-// AIX32: error: unsupported option '-fpatchable-function-entry=1' for target 'powerpc-ibm-aix-xcoff'
+// RUN: not %clang --target=ppc64 -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=TARGET %s
+// TARGET: error: unsupported option '-fpatchable-function-entry=1' for target 'ppc64'
 
 // RUN: not %clang --target=x86_64 -fsyntax-only %s -fpatchable-function-entry=1,0, 2>&1 | FileCheck --check-prefix=EXCESS %s
 // EXCESS: error: invalid argument '1,0,' to -fpatchable-function-entry=
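Since the flag syntax is easy to misread, its per-function analogue may help (a hedged aside; the attribute is the long-standing Clang/GCC spelling, not something this patch touches):

// Equivalent to building this one function with -fpatchable-function-entry=2,1:
// two NOPs in total, one of them placed before the function entry label.
__attribute__((patchable_function_entry(2, 1)))
void traced(void) {}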
diff --git a/clang/test/Driver/ftime-trace-sections.cpp b/clang/test/Driver/ftime-trace-sections.cpp
index da7109b9d81a6..0c16052bc0c3a 100644
--- a/clang/test/Driver/ftime-trace-sections.cpp
+++ b/clang/test/Driver/ftime-trace-sections.cpp
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t && mkdir %t && cd %t
-// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -o out %s
 // RUN: %python %S/ftime-trace-sections.py < out.json
 
 template <typename T>
diff --git a/clang/test/Driver/ftime-trace.cpp b/clang/test/Driver/ftime-trace.cpp
index 60c5885704b58..5fe63de915a71 100644
--- a/clang/test/Driver/ftime-trace.cpp
+++ b/clang/test/Driver/ftime-trace.cpp
@@ -1,18 +1,18 @@
 // RUN: rm -rf %t && mkdir -p %t && cd %t
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -o out %s
 // RUN: cat out.json \
 // RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
 // RUN:   | FileCheck %s
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -o out %s
 // RUN: cat new-name.json \
 // RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
 // RUN:   | FileCheck %s
 // RUN: mkdir dir1 dir2
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -o out %s
 // RUN: cat dir1/out.json \
 // RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
 // RUN:   | FileCheck %s
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -o out %s
 // RUN: cat dir2/out.json \
 // RUN:   | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
 // RUN:   | FileCheck %s
@@ -34,33 +34,32 @@
 // RUN: mkdir d e f && cp %s d/a.cpp && touch d/b.c
 
 /// TODO: Support -fno-integrated-as.
-// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1
-// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1
+// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0"
 
-// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2
-// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
-// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2
+// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0"
+// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0"
 
 /// -o specifies the link output. Create ${output}-${basename}.json.
-// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1
-// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
-// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1
+// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0"
+// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0"
 
 /// -dumpdir is f/g, not ending with a path separator. We create f/g${basename}.json.
-// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2
-// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
-// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2
+// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0"
+// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0"
 
-// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3
-// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
-// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3
+// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0"
+// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0"
 
-// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -ftime-trace-verbose -xassembler d/a.cpp 2>&1 | \
+// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -xassembler d/a.cpp 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=UNUSED
 // UNUSED:      warning: argument unused during compilation: '-ftime-trace'
 // UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace=e'
 // UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-granularity=1'
-// UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-verbose'
 // UNUSED-NOT:  warning:
 
 template <typename T>
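Read together, the checks above pin down a small set of path rules; a condensed restatement (hedged, with illustrative file names and the corresponding check prefixes in parentheses):

// clang -c -ftime-trace a.cpp -o e/a.o             -> e/a.json                 (COMPILE1)
// clang -ftime-trace a.cpp b.c -o e/x              -> e/x-a.json, e/x-b.json   (LINK1)
// clang -ftime-trace a.cpp b.c -o e/x -dumpdir f/g -> f/ga.json, f/gb.json     (LINK2)
// clang -ftime-trace=dir a.cpp b.c -o f/x          -> dir/a-<unique>.json, ... (LINK3)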
diff --git a/clang/test/Driver/loongarch-features.c b/clang/test/Driver/loongarch-features.c
index 90634bbcf0035..3cdf3ba3d23e1 100644
--- a/clang/test/Driver/loongarch-features.c
+++ b/clang/test/Driver/loongarch-features.c
@@ -2,7 +2,7 @@
 // RUN: %clang --target=loongarch64 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64
 
 // LA32: "target-features"="+32bit"
-// LA64: "target-features"="+64bit,+d,+f,+lsx,+ual"
+// LA64: "target-features"="+64bit,+d,+f,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-march.c b/clang/test/Driver/loongarch-march.c
index 2d5b315d962a1..9214130cd034f 100644
--- a/clang/test/Driver/loongarch-march.c
+++ b/clang/test/Driver/loongarch-march.c
@@ -2,22 +2,10 @@
 // RUN:   FileCheck %s --check-prefix=CC1-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LA464
-// RUN: %clang --target=loongarch64 -march=la64v1.0 -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-LA64V1P0
-// RUN: %clang --target=loongarch64 -march=la64v1.1 -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-LA64V1P1
-// RUN: %clang --target=loongarch64 -march=la664 -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-LA664
 // RUN: %clang --target=loongarch64 -march=loongarch64 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LA464
-// RUN: %clang --target=loongarch64 -march=la64v1.0 -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-LA64V1P0
-// RUN: %clang --target=loongarch64 -march=la64v1.1 -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-LA64V1P1
-// RUN: %clang --target=loongarch64 -march=la664 -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-LA664
 
 // CC1-LOONGARCH64: "-target-cpu" "loongarch64"
 // CC1-LOONGARCH64-NOT: "-target-feature"
@@ -31,29 +19,8 @@
 // CC1-LA464-NOT: "-target-feature"
 // CC1-LA464: "-target-abi" "lp64d"
 
-// CC1-LA64V1P0: "-target-cpu" "loongarch64"
-// CC1-LA64V1P0-NOT: "-target-feature"
-// CC1-LA64V1P0: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual"
-// CC1-LA64V1P0-NOT: "-target-feature"
-// CC1-LA64V1P0: "-target-abi" "lp64d"
-
-// CC1-LA64V1P1: "-target-cpu" "loongarch64"
-// CC1-LA64V1P1-NOT: "-target-feature"
-// CC1-LA64V1P1: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual" "-target-feature" "+frecipe"
-// CC1-LA64V1P1-NOT: "-target-feature"
-// CC1-LA64V1P1: "-target-abi" "lp64d"
-
-// CC1-LA664: "-target-cpu" "la664"
-// CC1-LA664-NOT: "-target-feature"
-// CC1-LA664: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual" "-target-feature" "+frecipe"
-// CC1-LA664-NOT: "-target-feature"
-// CC1-LA664: "-target-abi" "lp64d"
-
 // IR-LOONGARCH64: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+f,+ual"
 // IR-LA464: attributes #[[#]] ={{.*}}"target-cpu"="la464" {{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx,+ual"
-// IR-LA64V1P0: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+lsx,+ual"
-// IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+frecipe,+lsx,+ual"
-// IR-LA664: attributes #[[#]] ={{.*}}"target-cpu"="la664" {{.*}}"target-features"="+64bit,+d,+f,+frecipe,+lasx,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mlasx.c b/clang/test/Driver/loongarch-mlasx.c
index 87634ff5a9a40..0b934f125c9e4 100644
--- a/clang/test/Driver/loongarch-mlasx.c
+++ b/clang/test/Driver/loongarch-mlasx.c
@@ -5,7 +5,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-LSX
+// RUN:   FileCheck %s --check-prefix=CC1-NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LASX
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -fsyntax-only %s -### 2>&1 | \
@@ -18,7 +18,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-LSX
+// RUN:   FileCheck %s --check-prefix=IR-NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LASX
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -S -emit-llvm %s -o - | \
@@ -26,11 +26,9 @@
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LASX
 
-// CC1-LSX: "-target-feature" "+lsx"
 // CC1-LASX: "-target-feature" "+lsx" "-target-feature" "+lasx"
 // CC1-NOLASX: "-target-feature" "-lasx"
 
-// IR-LSX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+lsx{{(,.*)?}}"
 // IR-LASX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+lasx{{(,.*)?}}"
 // IR-NOLASX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-lasx{{(,.*)?}}"
 
diff --git a/clang/test/Driver/loongarch-msimd.c b/clang/test/Driver/loongarch-msimd.c
index 49d298e1b2e3f..cd463300c8747 100644
--- a/clang/test/Driver/loongarch-msimd.c
+++ b/clang/test/Driver/loongarch-msimd.c
@@ -75,9 +75,9 @@
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
diff --git a/clang/test/Driver/loongarch-msingle-float.c b/clang/test/Driver/loongarch-msingle-float.c
index 4eb0865b53a59..bd9b3e8a8c019 100644
--- a/clang/test/Driver/loongarch-msingle-float.c
+++ b/clang/test/Driver/loongarch-msingle-float.c
@@ -11,10 +11,10 @@
 // WARN: warning: ignoring '-mabi=lp64s' as it conflicts with that implied by '-msingle-float' (lp64f)
 // WARN: warning: ignoring '-mfpu=64' as it conflicts with that implied by '-msingle-float' (32)
 
-// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d" "-target-feature" "-lsx"
+// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d"
 // CC1: "-target-abi" "lp64f"
 
-// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d,-lsx"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-msoft-float.c b/clang/test/Driver/loongarch-msoft-float.c
index ebf27fb00e309..0e5121ac84b4c 100644
--- a/clang/test/Driver/loongarch-msoft-float.c
+++ b/clang/test/Driver/loongarch-msoft-float.c
@@ -11,10 +11,10 @@
 // WARN: warning: ignoring '-mabi=lp64d' as it conflicts with that implied by '-msoft-float' (lp64s)
 // WARN: warning: ignoring '-mfpu=64' as it conflicts with that implied by '-msoft-float' (0)
 
-// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d" "-target-feature" "-lsx"
+// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d"
 // CC1: "-target-abi" "lp64s"
 
-// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f,-lsx"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f{{(,.*)?}}"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mtune.c b/clang/test/Driver/loongarch-mtune.c
index face12e1a1a82..6f3f39e9bbd86 100644
--- a/clang/test/Driver/loongarch-mtune.c
+++ b/clang/test/Driver/loongarch-mtune.c
@@ -8,11 +8,6 @@
 // RUN: %clang --target=loongarch64 -mtune=la464 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IRATTR -DCPU=la464
 
-// RUN: %clang --target=loongarch64 -mtune=la664 -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1ARG -DCPU=la664
-// RUN: %clang --target=loongarch64 -mtune=la664 -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IRATTR -DCPU=la664
-
 // RUN: %clang --target=loongarch64 -mtune=invalidcpu -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1ARG -DCPU=invalidcpu
 // RUN: not %clang --target=loongarch64 -mtune=invalidcpu -S -emit-llvm %s -o /dev/null 2>&1 | \
diff --git a/clang/test/Driver/nvlink-wrapper.c b/clang/test/Driver/nvlink-wrapper.c
deleted file mode 100644
index fdda93f1f9cdc..0000000000000
--- a/clang/test/Driver/nvlink-wrapper.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// REQUIRES: x86-registered-target
-// REQUIRES: nvptx-registered-target
-
-#if defined(X)
-extern int y;
-int foo() { return y; }
-
-int x = 0;
-#elif defined(Y)
-int y = 42;
-#elif defined(Z)
-int z = 42;
-#elif defined(W)
-int w = 42;
-#elif defined(U)
-extern int x;
-extern int __attribute__((weak)) w;
-
-int bar() {
-  return x + w;
-}
-#else
-extern int y;
-int __attribute__((visibility("hidden"))) x = 999;
-int baz() { return y + x; }
-#endif
-
-// Create various inputs to test basic linking and LTO capabilities. Creating a
-// CUDA binary requires access to the `ptxas` executable, so we just use x64.
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DX -o %t-x.o
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DY -o %t-y.o
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DZ -o %t-z.o
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DW -o %t-w.o
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -DU -o %t-u.o
-// RUN: llvm-ar rcs %t-x.a %t-x.o
-// RUN: llvm-ar rcs %t-y.a %t-y.o
-// RUN: llvm-ar rcs %t-z.a %t-z.o
-// RUN: llvm-ar rcs %t-w.a %t-w.o
-
-//
-// Check that we forward any unrecognized argument to 'nvlink'.
-//
-// RUN: clang-nvlink-wrapper --dry-run -arch sm_52 %t-u.o -foo -o a.out 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=ARGS
-// ARGS: nvlink{{.*}} -arch sm_52 -foo -o a.out [[INPUT:.+]].cubin
-
-//
-// Check the symbol resolution for static archives. We expect to only link
-// `libx.a` and `liby.a` because extern weak symbols do not extract and `libz.a`
-// is not used at all.
-//
-// RUN: clang-nvlink-wrapper --dry-run %t-x.a %t-u.o %t-y.a %t-z.a %t-w.a \
-// RUN:   -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LINK
-// LINK: nvlink{{.*}} -arch sm_52 -o a.out [[INPUT:.+]].cubin {{.*}}-x-{{.*}}.cubin{{.*}}-y-{{.*}}.cubin
-
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.o
-
-//
-// Check that the LTO interface works and properly preserves symbols used in a
-// regular object file.
-//
-// RUN: clang-nvlink-wrapper --dry-run %t.o %t-u.o %t-y.a \
-// RUN:   -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
-// LTO: ptxas{{.*}} -m64 -c [[PTX:.+]].s -O3 -arch sm_52 -o [[CUBIN:.+]].cubin
-// LTO: nvlink{{.*}} -arch sm_52 -o a.out [[CUBIN]].cubin {{.*}}-u-{{.*}}.cubin {{.*}}-y-{{.*}}.cubin
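The archive-selection rule the LINK check encodes is plain linker semantics (a hedged aside; probe() is an illustrative helper, not from the patch):

// An extern weak reference does not force archive extraction: if nothing
// defines w, &w is simply null instead of an undefined-symbol error, which
// is why %t-w.a stays out of the link above while %t-x.a and %t-y.a are pulled in.
extern int __attribute__((weak)) w;
int probe(void) { return &w ? w : -1; }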
diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 91f12b8416b2a..1dc4580ec202e 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -35,6 +35,7 @@
 // CHECK-NEXT:     za64rs               1.0       'Za64rs' (Reservation Set Size of at Most 64 Bytes)
 // CHECK-NEXT:     zaamo                1.0       'Zaamo' (Atomic Memory Operations)
 // CHECK-NEXT:     zabha                1.0       'Zabha' (Byte and Halfword Atomic Memory Operations)
+// CHECK-NEXT:     zacas                1.0       'Zacas' (Atomic Compare-And-Swap Instructions)
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zama16b              1.0       'Zama16b' (Atomic 16-byte misaligned loads, stores and AMOs)
 // CHECK-NEXT:     zawrs                1.0       'Zawrs' (Wait on Reservation Set)
@@ -170,7 +171,6 @@
 // CHECK-NEXT: Experimental extensions
 // CHECK-NEXT:     zicfilp              1.0       'Zicfilp' (Landing pad)
 // CHECK-NEXT:     zicfiss              1.0       'Zicfiss' (Shadow stack)
-// CHECK-NEXT:     zacas                1.0       'Zacas' (Atomic Compare-And-Swap Instructions)
 // CHECK-NEXT:     zalasr               0.1       'Zalasr' (Load-Acquire and Store-Release Instructions)
 // CHECK-NEXT:     smmpm                1.0       'Smmpm' (Machine-level Pointer Masking for M-mode)
 // CHECK-NEXT:     smnpm                1.0       'Smnpm' (Machine-level Pointer Masking for next lower privilege mode)
diff --git a/clang/test/Driver/ps4-ps5-runtime-flags.c b/clang/test/Driver/ps4-ps5-runtime-flags.c
index 4b00aead6fe5c..e75ba97948d23 100644
--- a/clang/test/Driver/ps4-ps5-runtime-flags.c
+++ b/clang/test/Driver/ps4-ps5-runtime-flags.c
@@ -40,5 +40,5 @@
 // RUN: %clang -target x86_64-sie-ps5 -fcs-profile-generate %s -### 2>&1 | FileCheck --check-prefix=CHECK-PS5-PROFILE %s
 // RUN: %clang -target x86_64-sie-ps5 -fcs-profile-generate -fno-profile-generate %s -### 2>&1 | FileCheck --check-prefix=CHECK-PS5-NO-PROFILE %s
 //
-// CHECK-PS5-PROFILE: "--dependent-lib=libclang_rt.profile_nosubmission.a"
-// CHECK-PS5-NO-PROFILE-NOT: "--dependent-lib=libclang_rt.profile_nosubmission.a"
+// CHECK-PS5-PROFILE: "--dependent-lib=libclang_rt.profile-x86_64_nosubmission.a"
+// CHECK-PS5-NO-PROFILE-NOT: "--dependent-lib=libclang_rt.profile-x86_64_nosubmission.a"
diff --git a/clang/test/Parser/cxx-template-decl.cpp b/clang/test/Parser/cxx-template-decl.cpp
index 476341686a64b..734438069b9ae 100644
--- a/clang/test/Parser/cxx-template-decl.cpp
+++ b/clang/test/Parser/cxx-template-decl.cpp
@@ -297,7 +297,3 @@ namespace PR46231 {
   template<> int; // expected-error {{declaration does not declare anything}}
   template<int> int; // expected-error {{declaration does not declare anything}}
 }
-
-namespace PR99933 {
-  void foo() { template <typename> int i; } // expected-error {{templates can only be declared in namespace or class scope}}
-}
diff --git a/clang/test/Preprocessor/embed_weird.cpp b/clang/test/Preprocessor/embed_weird.cpp
index 90180e2d3cc70..6eb2923152f15 100644
--- a/clang/test/Preprocessor/embed_weird.cpp
+++ b/clang/test/Preprocessor/embed_weird.cpp
@@ -116,14 +116,6 @@ void f1() {
 }
 #endif
 
-static_assert(_Generic(
-#embed __FILE__ limit(1)
-  , int : 1, default : 0));
-
-static_assert(alignof(typeof(
-#embed __FILE__ limit(1)
-)) == alignof(int));
-
 struct HasChar {
   signed char ch;
 };
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index 771d56ffb1c1b..182f904b76592 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -798,51 +798,24 @@
 // LA64-FPU0-LP64S-NOT: #define __loongarch_single_float
 // LA64-FPU0-LP64S: #define __loongarch_soft_float 1
 
-/// Check __loongarch_arch{_tune/_frecipe}.
+/// Check __loongarch_arch and __loongarch_tune.
 
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la464 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la464 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=loongarch64 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la464 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la464 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la464 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la464 -mtune=loongarch64 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la464 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang -lsx | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +frecipe | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 | \
-// RUN:   FileCheck --match-full-lines  --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -frecipe | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lsx | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +frecipe | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +frecipe | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=la664 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la664 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la664 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la664 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 -mtune=loongarch64 | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=loongarch64 %s
 
 // ARCH-TUNE: #define __loongarch_arch "[[ARCH]]"
-// FRECIPE: #define __loongarch_frecipe 1
 // ARCH-TUNE: #define __loongarch_tune "[[TUNE]]"
 
 // RUN: %clang --target=loongarch64 -mlsx -x c -E -dM %s -o - \
@@ -851,8 +824,6 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // RUN: %clang --target=loongarch64 -mlsx -mno-lasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
-// RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
-// RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // MLSX-NOT: #define __loongarch_asx
@@ -861,12 +832,12 @@
 
 // RUN: %clang --target=loongarch64 -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
+// RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
-// RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
-// RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // MLASX: #define __loongarch_asx 1
 // MLASX: #define __loongarch_simd_width 256
 // MLASX: #define __loongarch_sx 1
@@ -879,6 +850,8 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
+// RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // MNO-LSX-NOT: #define __loongarch_asx
 // MNO-LSX-NOT: #define __loongarch_simd_width
 // MNO-LSX-NOT: #define __loongarch_sx
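
A note on the LoongArch macros exercised above: __loongarch_arch and
__loongarch_tune expand to string literals, so user code normally dispatches
on the SIMD macros instead. A minimal C sketch, illustrative and not part of
the patch (only the macro names and the 256-bit LASX width come from the
MLSX/MLASX checks above):

#if defined(__loongarch_asx)
// LASX available: __loongarch_simd_width is 256.
typedef int vec __attribute__((vector_size(32)));
#elif defined(__loongarch_sx)
// LSX only: 128-bit vectors.
typedef int vec __attribute__((vector_size(16)));
#else
typedef int vec; // scalar fallback
#endif
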
diff --git a/clang/test/Preprocessor/pragma_mc_func.c b/clang/test/Preprocessor/pragma_mc_func.c
deleted file mode 100644
index f0d3e49e5dddc..0000000000000
--- a/clang/test/Preprocessor/pragma_mc_func.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: not %clang --target=powerpc64-ibm-aix -ferr-pragma-mc-func-aix -fsyntax-only \
-// RUN:   %s 2>&1 | FileCheck %s
-#pragma mc_func asm_barrier {"60000000"}
-
-// CHECK:  error: #pragma mc_func is not supported
-
-// Cases where no errors occur.
-// RUN: %clang --target=powerpc64-ibm-aix -fno-err-pragma-mc-func-aix -fsyntax-only %s
-// RUN: %clang --target=powerpc64-ibm-aix -ferr-pragma-mc-func-aix -fsyntax-only \
-// RUN:    -fno-err-pragma-mc-func-aix %s
-// RUN: %clang --target=powerpc64-ibm-aix -fsyntax-only %s
-// RUN: %clang --target=powerpc64-ibm-aix -Werror=unknown-pragmas \
-// RUN:   -fno-err-pragma-mc-func-aix -fsyntax-only %s
-
-// Cases where we have errors or warnings.
-// RUN: not %clang --target=powerpc64le-unknown-linux-gnu \
-// RUN:   -Werror=unknown-pragmas -fno-err-pragma-mc-func-aix -fsyntax-only %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=UNUSED %s
-// RUN: %clang --target=powerpc64le-unknown-linux-gnu \
-// RUN:   -fno-err-pragma-mc-func-aix -fsyntax-only %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=UNUSED %s
-
-// UNUSED: clang: warning: argument unused during compilation: '-fno-err-pragma-mc-func-aix' [-Wunused-command-line-argument]
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index 72131108cb5f6..fd718a126aaa7 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -86,6 +86,7 @@
 // CHECK-NOT: __riscv_za64rs {{.*$}}
 // CHECK-NOT: __riscv_zaamo {{.*$}}
 // CHECK-NOT: __riscv_zabha {{.*$}}
+// CHECK-NOT: __riscv_zacas {{.*$}}
 // CHECK-NOT: __riscv_zalrsc {{.*$}}
 // CHECK-NOT: __riscv_zama16b {{.*$}}
 // CHECK-NOT: __riscv_zawrs {{.*$}}
@@ -181,7 +182,6 @@
 // CHECK-NOT: __riscv_sspm{{.*$}}
 // CHECK-NOT: __riscv_ssqosid{{.*$}}
 // CHECK-NOT: __riscv_supm{{.*$}}
-// CHECK-NOT: __riscv_zacas {{.*$}}
 // CHECK-NOT: __riscv_zalasr {{.*$}}
 // CHECK-NOT: __riscv_zfbfmin {{.*$}}
 // CHECK-NOT: __riscv_zicfilp {{.*$}}
@@ -747,6 +747,14 @@
 // RUN:   -o - | FileCheck --check-prefix=CHECK-ZABHA-EXT %s
 // CHECK-ZABHA-EXT: __riscv_zabha 1000000{{$}}
 
+// RUN: %clang --target=riscv32 \
+// RUN:   -march=rv32ia_zacas1p0 -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
+// RUN: %clang --target=riscv64 \
+// RUN:   -march=rv64ia_zacas1p0 -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
+// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}}
+
 // RUN: %clang --target=riscv32 \
 // RUN:   -march=rv32i_zalrsc1p0 -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-ZALRSC-EXT %s
@@ -1618,14 +1626,6 @@
 // CHECK-ZVKT-EXT: __riscv_zvkt 1000000{{$}}
 
 // Experimental extensions
-// RUN: %clang --target=riscv32 -menable-experimental-extensions \
-// RUN:   -march=rv32ia_zacas1p0 -E -dM %s \
-// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
-// RUN: %clang --target=riscv64 -menable-experimental-extensions \
-// RUN:   -march=rv64ia_zacas1p0 -E -dM %s \
-// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
-// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}}
-
 // RUN: %clang --target=riscv32 -menable-experimental-extensions \
 // RUN:   -march=rv32i_zalasr0p1 -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-ZALASR-EXT %s
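
Since Zacas is no longer gated behind -menable-experimental-extensions, user
code can test for it with the same macro these CHECK lines verify. A minimal
sketch, where only the __riscv_zacas name and its 1000000 (v1.0) encoding
come from the test; the function itself is illustrative:

#include <stdbool.h>

bool cas_int(int *p, int expected, int desired) {
#if defined(__riscv_zacas) && __riscv_zacas >= 1000000
  // With Zacas the backend may lower this to an amocas instruction rather
  // than an LR/SC loop (hedged: the exact lowering is up to LLVM).
#endif
  return __atomic_compare_exchange_n(p, &expected, desired, false,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
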
diff --git a/clang/test/Sema/aarch64-neon-without-target-feature.cpp b/clang/test/Sema/aarch64-neon-without-target-feature.cpp
deleted file mode 100644
index 0831eb7c754a7..0000000000000
--- a/clang/test/Sema/aarch64-neon-without-target-feature.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +dotprod  -target-feature +fullfp16 -target-feature +fp16fml -target-feature +i8mm -target-feature +bf16 -verify -emit-llvm -o - %s
-
-// REQUIRES: aarch64-registered-target
-
-// This test is testing the diagnostics that Clang emits when compiling without '+neon'.
-
-#include <arm_neon.h>
-
-void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t v16i8, uint8x8_t v8i8, float32x2_t v2f32, float32x4_t v4f32, float16x4_t v4f16, float64x2_t v2f64, bfloat16x4_t v4bf16, __bf16 bf16, poly64_t poly64, poly64x2_t poly64x2) {
-  // dotprod
-  vdot_u32(v2i32, v8i8, v8i8); // expected-error {{always_inline function 'vdot_u32' requires target feature 'neon'}}
-  vdot_laneq_u32(v2i32, v8i8, v16i8, 1); // expected-error {{always_inline function 'vdot_u32' requires target feature 'neon'}} expected-error {{'__builtin_neon_splat_laneq_v' needs target feature neon}}
-  // fp16
-  vceqz_f16(v4f16); // expected-error {{always_inline function 'vceqz_f16' requires target feature 'neon'}}
-  vrnd_f16(v4f16); // expected-error {{always_inline function 'vrnd_f16' requires target feature 'neon'}}
-  vmaxnm_f16(v4f16, v4f16); // expected-error {{always_inline function 'vmaxnm_f16' requires target feature 'neon'}}
-  vrndi_f16(v4f16); // expected-error {{always_inline function 'vrndi_f16' requires target feature 'neon'}}
-  // fp16fml depends on fp-armv8
-  vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'neon'}}
-  // i8mm
-  vmmlaq_s32(v4i32, v8i16, v8i16); // expected-error {{always_inline function 'vmmlaq_s32' requires target feature 'neon'}}
-  vusdot_laneq_s32(v2i32, v8i8, v8i16, 0); // expected-error {{always_inline function 'vusdot_s32' requires target feature 'neon'}} expected-error {{'__builtin_neon_splat_laneq_v' needs target feature neon}}
-  // bf16
-  vbfdot_f32(v2f32, v4bf16, v4bf16); // expected-error {{always_inline function 'vbfdot_f32' requires target feature 'neon'}}
-  vcreate_bf16(10);
-  vdup_lane_bf16(v4bf16, 2); // expected-error {{'__builtin_neon_splat_lane_bf16' needs target feature bf16,neon}}
-  vdup_n_bf16(bf16); // expected-error {{always_inline function 'vdup_n_bf16' requires target feature 'neon'}}
-  vld1_bf16(0); // expected-error {{'__builtin_neon_vld1_bf16' needs target feature bf16,neon}}
-  vcvt_f32_bf16(v4bf16); // expected-error {{always_inline function 'vcvt_f32_bf16' requires target feature 'neon'}}
-  vcvt_bf16_f32(v4f32); // expected-error {{always_inline function 'vcvt_bf16_f32' requires target feature 'neon'}}
-  vmull_p64(poly64, poly64);  // expected-error {{always_inline function 'vmull_p64' requires target feature 'neon'}}
-  vmull_high_p64(poly64x2, poly64x2);  // expected-error {{always_inline function 'vmull_high_p64' requires target feature 'neon'}}
-  vtrn1_s8(v8i8, v8i8); // expected-error {{always_inline function 'vtrn1_s8' requires target feature 'neon'}}
-  vqabsq_s16(v8i16); // expected-error {{always_inline function 'vqabsq_s16' requires target feature 'neon'}}
-  vbslq_s16(v8i16, v8i16, v8i16);// expected-error {{always_inline function 'vbslq_s16' requires target feature 'neon'}}
-}
diff --git a/clang/test/Sema/atomic-ops.c b/clang/test/Sema/atomic-ops.c
index 2405f804d0da5..9b82d82ff8269 100644
--- a/clang/test/Sema/atomic-ops.c
+++ b/clang/test/Sema/atomic-ops.c
@@ -124,31 +124,6 @@ _Static_assert(__atomic_always_lock_free(4, &i64), "");
 _Static_assert(!__atomic_always_lock_free(8, &i32), "");
 _Static_assert(__atomic_always_lock_free(8, &i64), "");
 
-// Validate use with fake pointers constants. This mechanism is used to allow
-// validating atomicity of a given size and alignment.
-_Static_assert(__atomic_is_lock_free(1, (void*)1), "");
-_Static_assert(__atomic_is_lock_free(1, (void*)-1), "");
-_Static_assert(__atomic_is_lock_free(4, (void*)2), ""); // expected-error {{not an integral constant expression}}
-_Static_assert(__atomic_is_lock_free(4, (void*)-2), ""); // expected-error {{not an integral constant expression}}
-_Static_assert(__atomic_is_lock_free(4, (void*)4), "");
-_Static_assert(__atomic_is_lock_free(4, (void*)-4), "");
-
-_Static_assert(__atomic_always_lock_free(1, (void*)1), "");
-_Static_assert(__atomic_always_lock_free(1, (void*)-1), "");
-_Static_assert(!__atomic_always_lock_free(4, (void*)2), "");
-_Static_assert(!__atomic_always_lock_free(4, (void*)-2), "");
-_Static_assert(__atomic_always_lock_free(4, (void*)4), "");
-_Static_assert(__atomic_always_lock_free(4, (void*)-4), "");
-
-// Ensure that "weird" constants don't cause trouble.
-_Static_assert(__atomic_always_lock_free(1, "string"), "");
-_Static_assert(!__atomic_always_lock_free(2, "string"), "");
-_Static_assert(__atomic_always_lock_free(2, (int[2]){}), "");
-void dummyfn();
-_Static_assert(__atomic_always_lock_free(2, dummyfn) || 1, "");
-
-
-
 #define _AS1 __attribute__((address_space(1)))
 #define _AS2 __attribute__((address_space(2)))
 
diff --git a/clang/test/Sema/patchable-function-entry-attr.cpp b/clang/test/Sema/patchable-function-entry-attr.cpp
index bd4d57a7e3093..9134c851da588 100644
--- a/clang/test/Sema/patchable-function-entry-attr.cpp
+++ b/clang/test/Sema/patchable-function-entry-attr.cpp
@@ -6,14 +6,9 @@
 // RUN: %clang_cc1 -triple loongarch64 -fsyntax-only -verify=silence %s
 // RUN: %clang_cc1 -triple riscv32 -fsyntax-only -verify=silence %s
 // RUN: %clang_cc1 -triple riscv64 -fsyntax-only -verify=silence %s
-// RUN: %clang_cc1 -triple powerpc-unknown-linux-gnu -fsyntax-only -verify=silence %s
-// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -fsyntax-only -verify=silence %s
 // RUN: %clang_cc1 -triple ppc64le -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fsyntax-only -verify=AIX %s
-// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fsyntax-only -verify=AIX %s
 
 // silence-no-diagnostics
 
-// AIX-error@+2 {{'patchable_function_entry' attribute is not yet supported on AIX}}
 // expected-warning@+1 {{unknown attribute 'patchable_function_entry' ignored}}
 [[gnu::patchable_function_entry(0)]] void f();
diff --git a/clang/test/Sema/ptrauth-indirect-goto.c b/clang/test/Sema/ptrauth-indirect-goto.c
deleted file mode 100644
index 47bc76738d23b..0000000000000
--- a/clang/test/Sema/ptrauth-indirect-goto.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// RUN: %clang_cc1 -triple arm64e-apple-darwin -fsyntax-only -verify %s -fptrauth-indirect-gotos
-
-int f() {
-  static void *addrs[] = { &&l1, &&l2 };
-
-  static int diffs[] = {
-    &&l1 - &&l1, // expected-error{{subtraction of address-of-label expressions is not supported with ptrauth indirect gotos}}
-    &&l1 - &&l2 // expected-error{{subtraction of address-of-label expressions is not supported with ptrauth indirect gotos}}
-  };
-
-  int diff_32 = &&l1 - &&l2; // expected-error{{subtraction of address-of-label expressions is not supported with ptrauth indirect gotos}}
-  goto *(&&l1 + diff_32); // expected-error{{addition of address-of-label expressions is not supported with ptrauth indirect gotos}}
-
-l1:
-  return 0;
-l2:
-  return 1;
-}
diff --git a/clang/test/SemaCXX/builtin_vectorelements.cpp b/clang/test/SemaCXX/builtin_vectorelements.cpp
index b23675ea0ac6a..59ff09ac72e42 100644
--- a/clang/test/SemaCXX/builtin_vectorelements.cpp
+++ b/clang/test/SemaCXX/builtin_vectorelements.cpp
@@ -1,9 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64                       -std=c++20 -fsyntax-only -verify -disable-llvm-passes %s
-// RUN: %clang_cc1 -triple x86_64                       -std=c++20 -fsyntax-only -verify -disable-llvm-passes -fexperimental-new-constant-interpreter %s
 
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -std=c++20 -fsyntax-only -verify -disable-llvm-passes %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -std=c++20 -fsyntax-only -verify -disable-llvm-passes -fexperimental-new-constant-interpreter %s
 
 template <typename T>
 using VecT __attribute__((vector_size(16))) = T;
diff --git a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
index eafadb07b29e1..5e189b758c61e 100644
--- a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
+++ b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
@@ -9,12 +9,12 @@
 #endif
 
 class A {
-  template<typename T> CONST T wrong;           // expected-error {{non-static data member 'wrong' cannot be declared as a template}}
-  template<typename T> CONST T wrong_init = 5;      // expected-error {{non-static data member 'wrong_init' cannot be declared as a template}}
+  template<typename T> CONST T wrong;           // expected-error {{member 'wrong' declared as a template}}
+  template<typename T> CONST T wrong_init = 5;      // expected-error {{member 'wrong_init' declared as a template}}
   template<typename T, typename T0> static CONST T right = T(100);
   template<typename T> static CONST T right<T,int> = 5;
-  template<typename T> CONST int right<int,T>;  // expected-error {{non-static data member 'right' cannot be declared as a template}}
-  template<typename T> CONST float right<float,T> = 5;  // expected-error {{non-static data member 'right' cannot be declared as a template}}
+  template<typename T> CONST int right<int,T>;  // expected-error {{member 'right' declared as a template}}
+  template<typename T> CONST float right<float,T> = 5;  // expected-error {{member 'right' declared as a template}}
 #ifdef PRECXX11
                                                        // expected-warning@-2 {{in-class initializer for static data member of type 'const float' is a GNU extension}}
 #else
@@ -161,14 +161,14 @@ namespace non_const_init {
 #ifndef PRECXX11
 namespace constexpred {
   class A {
-    template<typename T> constexpr T wrong;           // expected-error {{non-static data member 'wrong' cannot be declared as a template}}
+    template<typename T> constexpr T wrong;           // expected-error {{member 'wrong' declared as a template}}
                                                      // expected-error@-1 {{declaration of constexpr static data member 'wrong' requires an initializer}}
-    template<typename T> constexpr T wrong_init = 5;  // expected-error {{non-static data member 'wrong_init' cannot be declared as a template}}
+    template<typename T> constexpr T wrong_init = 5;  // expected-error {{member 'wrong_init' declared as a template}}
     template<typename T, typename T0> static constexpr T right = T(100);
     template<typename T> static constexpr T right<T,int> = 5;
-    template<typename T> constexpr int right<int,T>;         // expected-error {{non-static data member 'right' cannot be declared as a template}}
+    template<typename T> constexpr int right<int,T>;         // expected-error {{member 'right' declared as a template}}
                                                             // expected-error@-1 {{declaration of constexpr static data member 'right<int, T>' requires an initializer}}
-    template<typename T> constexpr float right<float,T> = 5; // expected-error {{non-static data member 'right' cannot be declared as a template}}
+    template<typename T> constexpr float right<float,T> = 5; // expected-error {{member 'right' declared as a template}}
     template<> constexpr int right<int,int> = 7;
     template<> constexpr float right<float, int>; // expected-error {{declaration of constexpr static data member 'right<float, int>' requires an initializer}}
     template static constexpr int right<int,int>;     // expected-error {{expected '<' after 'template'}}
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 5392573fcdb9d..2193be03ad9a3 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -110,8 +110,8 @@ struct Foo {
 
 template <typename X, int Y>
 using Bar = Foo<X, sizeof(X)>; // expected-note {{candidate template ignored: couldn't infer template argument 'X'}} \
-                               // expected-note {{implicit deduction guide declared as 'template <typename X> requires __is_deducible(test9::Bar, Foo<X, sizeof(X)>) Bar(Foo<X, sizeof(X)>) -> Foo<X, sizeof(X)>'}} \
-                               // expected-note {{implicit deduction guide declared as 'template <typename X> requires __is_deducible(test9::Bar, Foo<X, sizeof(X)>) Bar(const X (&)[sizeof(X)]) -> Foo<X, sizeof(X)>'}} \
+                               // expected-note {{implicit deduction guide declared as 'template <typename X> requires __is_deducible(test9::Bar, Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>) Bar(Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>) -> Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>'}} \
+                               // expected-note {{implicit deduction guide declared as 'template <typename X> requires __is_deducible(test9::Bar, Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>) Bar(const type-parameter-0-0 (&)[sizeof(type-parameter-0-0)]) -> Foo<type-parameter-0-0, sizeof(type-parameter-0-0)>'}} \
                                // expected-note {{candidate template ignored: constraints not satisfied [with X = int]}} \
                                // expected-note {{cannot deduce template arguments for 'Bar' from 'Foo<int, 4UL>'}}
 
@@ -138,13 +138,13 @@ namespace test11 {
 struct A {};
 template<class T> struct Foo { T c; };
 template<class X, class Y=A>
-using AFoo = Foo<Y>; // expected-note {{candidate template ignored: could not match 'Foo<Y>' against 'int'}} \
-                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<Y>) AFoo(Foo<Y>) -> Foo<Y>'}} \
+using AFoo = Foo<Y>; // expected-note {{candidate template ignored: could not match 'Foo<type-parameter-0-0>' against 'int'}} \
+                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<type-parameter-0-0>) AFoo(Foo<type-parameter-0-0>) -> Foo<type-parameter-0-0>'}} \
                     // expected-note {{candidate template ignored: constraints not satisfied [with Y = int]}} \
                     // expected-note {{cannot deduce template arguments for 'AFoo' from 'Foo<int>'}} \
-                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<Y>) AFoo(Y) -> Foo<Y>'}} \
+                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<type-parameter-0-0>) AFoo(type-parameter-0-0) -> Foo<type-parameter-0-0>'}} \
                     // expected-note {{candidate function template not viable: requires 0 arguments, but 1 was provided}} \
-                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<Y>) AFoo() -> Foo<Y>'}}
+                    // expected-note {{implicit deduction guide declared as 'template <class Y = A> requires __is_deducible(test11::AFoo, Foo<type-parameter-0-0>) AFoo() -> Foo<type-parameter-0-0>'}}
 
 AFoo s = {1}; // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'AFoo'}}
 } // namespace test11
@@ -211,9 +211,9 @@ template<typename> concept False = false;
 template<False W>
 using BFoo = AFoo<W>; // expected-note {{candidate template ignored: constraints not satisfied [with V = int]}} \
                       // expected-note {{cannot deduce template arguments for 'BFoo' from 'Foo<int *>'}} \
-                      // expected-note {{implicit deduction guide declared as 'template <class V> requires __is_deducible(AFoo, Foo<V *>) && __is_deducible(test15::BFoo, Foo<V *>) BFoo(V *) -> Foo<V *>}} \
-                      // expected-note {{candidate template ignored: could not match 'Foo<V *>' against 'int *'}} \
-                      // expected-note {{template <class V> requires __is_deducible(AFoo, Foo<V *>) && __is_deducible(test15::BFoo, Foo<V *>) BFoo(Foo<V *>) -> Foo<V *>}}
+                      // expected-note {{implicit deduction guide declared as 'template <class V> requires __is_deducible(AFoo, Foo<type-parameter-0-0 *>) && __is_deducible(test15::BFoo, Foo<type-parameter-0-0 *>) BFoo(type-parameter-0-0 *) -> Foo<type-parameter-0-0 *>}} \
+                      // expected-note {{candidate template ignored: could not match 'Foo<type-parameter-0-0 *>' against 'int *'}} \
+                      // expected-note {{template <class V> requires __is_deducible(AFoo, Foo<type-parameter-0-0 *>) && __is_deducible(test15::BFoo, Foo<type-parameter-0-0 *>) BFoo(Foo<type-parameter-0-0 *>) -> Foo<type-parameter-0-0 *>}}
 int i = 0;
 AFoo a1(&i); // OK, deduce Foo<int *>
 
@@ -263,12 +263,12 @@ template<typename T> requires False<T> // expected-note {{because 'int' does not
 Foo(T) -> Foo<int>;
 
 template <typename U>
-using Bar = Foo<U>; // expected-note {{could not match 'Foo<U>' against 'int'}} \
-                    // expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<U>) Bar(Foo<U>) -> Foo<U>'}} \
+using Bar = Foo<U>; // expected-note {{could not match 'Foo<type-parameter-0-0>' against 'int'}} \
+                    // expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<type-parameter-0-0>) Bar(Foo<type-parameter-0-0>) -> Foo<type-parameter-0-0>'}} \
                     // expected-note {{candidate template ignored: constraints not satisfied}} \
-                    // expected-note {{implicit deduction guide declared as 'template <typename T> requires False<T> && __is_deducible(test18::Bar, Foo<int>) Bar(T) -> Foo<int>'}} \
+                    // expected-note {{implicit deduction guide declared as 'template <typename T> requires False<type-parameter-0-0> && __is_deducible(test18::Bar, Foo<int>) Bar(type-parameter-0-0) -> Foo<int>'}} \
                     // expected-note {{candidate function template not viable}} \
-                    // expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<U>) Bar() -> Foo<U>'}}
+                    // expected-note {{implicit deduction guide declared as 'template <typename U> requires __is_deducible(test18::Bar, Foo<type-parameter-0-0>) Bar() -> Foo<type-parameter-0-0>'}}
 
 Bar s = {1}; // expected-error {{no viable constructor or deduction guide for deduction of template arguments}}
 } // namespace test18
@@ -296,8 +296,8 @@ class Foo {};
 // Verify that template template type parameter TTP is referenced/used in the
 // template arguments of the RHS.
 template <template<typename> typename TTP>
-using Bar = Foo<K<TTP>>; // expected-note {{candidate template ignored: could not match 'Foo<K<TTP>>' against 'int'}} \
-                        // expected-note {{implicit deduction guide declared as 'template <template <typename> typename TTP> requires __is_deducible(test20::Bar, Foo<K<TTP>>) Bar(Foo<K<TTP>>) -> Foo<K<TTP>>'}}
+using Bar = Foo<K<TTP>>; // expected-note {{candidate template ignored: could not match 'Foo<K<template-parameter-0-0>>' against 'int'}} \
+                        // expected-note {{implicit deduction guide declared as 'template <template <typename> typename TTP> requires __is_deducible(test20::Bar, Foo<K<template-parameter-0-0>>) Bar(Foo<K<template-parameter-0-0>>) -> Foo<K<template-parameter-0-0>>'}}
 
 template <class T>
 class Container {};
diff --git a/clang/test/SemaCXX/invalid-template-declaration.cpp b/clang/test/SemaCXX/invalid-template-declaration.cpp
deleted file mode 100644
index fda147520cca8..0000000000000
--- a/clang/test/SemaCXX/invalid-template-declaration.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only
-// PR99933
-
-struct S {
-  template <typename> int i; // expected-error {{non-static data member 'i' cannot be declared as a template}}
-};
diff --git a/clang/test/SemaCXX/modules.cppm b/clang/test/SemaCXX/modules.cppm
index 41204be76eafa..267417bf5da2c 100644
--- a/clang/test/SemaCXX/modules.cppm
+++ b/clang/test/SemaCXX/modules.cppm
@@ -1,19 +1,17 @@
-// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %s -o %t.0.pcm -verify -DTEST=0
-// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %s -o %t.1.pcm -verify -DTEST=1
-// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %s -fmodule-file=foo=%t.0.pcm -o %t.2.pcm -verify -DTEST=2
-// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %s -fmodule-file=foo=%t.0.pcm -o %t.3.pcm -verify -Dfoo=bar -DTEST=3
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
 
-#if TEST == 0 || TEST == 2
-// expected-no-diagnostics
-#endif
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/A.cppm -o %t.0.pcm -verify
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/B.cppm -o %t.1.pcm -verify
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/C.cppm -fmodule-file=foo=%t.0.pcm -o %t.2.pcm -verify
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/D.cppm -fmodule-file=foo=%t.0.pcm -o %t.3.pcm -verify
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/E.cppm -fmodule-file=foo=%t.0.pcm -o %t.3.pcm -verify -Dfoo=bar
 
+//--- A.cppm
 export module foo;
-
 static int m;
-
 int n;
-
-#if TEST == 0
 export {
   int a;
   int b;
@@ -27,7 +25,43 @@ export void f() {}
 
 export struct T {
 } t;
-#elif TEST == 3
+// expected-no-diagnostics
+
+//--- B.cppm
+export module foo;
+static int m;
+int n;
+struct S {
+  export int n;        // expected-error {{expected member name or ';'}}
+  export static int n; // expected-error {{expected member name or ';'}}
+};
+
+// FIXME: Exports of declarations without external linkage are disallowed.
+// Exports of declarations with non-external-linkage types are disallowed.
+
+// Cannot export within another export. This isn't precisely covered by the
+// language rules right now, but (per personal correspondence between zygoloid
+// and gdr) is the intent.
+export { // expected-note {{export block begins here}}
+  extern "C++" {
+    namespace NestedExport {
+      export { // expected-error {{export declaration appears within another export declaration}}
+        int q;
+      }
+    } // namespace NestedExport
+  }
+}
+
+//--- C.cppm
+export module foo;
+static int m;
+int n;
+// expected-no-diagnostics
+
+//--- D.cppm
+export module foo;
+static int m;
+int n;
 int use_a = a; // expected-error {{use of undeclared identifier 'a'}}
 
 #undef foo
@@ -46,29 +80,12 @@ int use_n = n; // FIXME: this should not be visible, because it is not exported
 
 extern int n;
 static_assert(&n != p); // expected-error{{use of undeclared identifier 'p'}}
-#endif
 
-#if TEST == 1
-struct S {
-  export int n;        // expected-error {{expected member name or ';'}}
-  export static int n; // expected-error {{expected member name or ';'}}
-};
-#endif
-
-// FIXME: Exports of declarations without external linkage are disallowed.
-// Exports of declarations with non-external-linkage types are disallowed.
+//--- E.cppm
+export module foo; // expected-error {{the module name in a module declaration cannot contain an object-like macro 'foo'}}
+static int m;
+int n;
+int use_a = a; // expected-error {{use of undeclared identifier 'a'}}
 
-// Cannot export within another export. This isn't precisely covered by the
-// language rules right now, but (per personal correspondence between zygoloid
-// and gdr) is the intent.
-#if TEST == 1
-export { // expected-note {{export block begins here}}
-  extern "C++" {
-  namespace NestedExport {
-  export { // expected-error {{export declaration appears within another export declaration}}
-    int q;
-  }
-  } // namespace NestedExport
-  }
-}
-#endif
+#undef foo
+import foo; // expected-error {{imports must immediately follow the module declaration}}
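
The rewrite above replaces the -DTEST=N multiplexing with LLVM's split-file
utility, which copies each "//--- <name>" chunk of the input into its own
file under %t. A minimal sketch of the idiom; the file and module names here
are placeholders, not from the patch:

// RUN: rm -rf %t
// RUN: mkdir -p %t
// RUN: split-file %s %t
// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/M.cppm -o %t/M.pcm -verify

//--- M.cppm
// expected-no-diagnostics
export module M;
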
diff --git a/clang/test/SemaTemplate/alias-template-deprecated.cpp b/clang/test/SemaTemplate/alias-template-deprecated.cpp
deleted file mode 100644
index 7418e222bcfc5..0000000000000
--- a/clang/test/SemaTemplate/alias-template-deprecated.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
-//
-// This test checks that a deprecated attribute on an alias
-// template triggers a warning diagnostic when it is used.
-
-template <typename T>
-struct NoAttr {
-  void foo() {}
-};
-
-// expected-note@+2 7{{'UsingWithAttr' has been explicitly marked deprecated here}}
-template <typename T>
-using UsingWithAttr __attribute__((deprecated)) = NoAttr<T>;
-
-// expected-note@+1 {{'UsingInstWithAttr' has been explicitly marked deprecated here}}
-using UsingInstWithAttr __attribute__((deprecated)) = NoAttr<int>;
-
-// expected-note@+1 {{'TDWithAttr' has been explicitly marked deprecated here}}
-typedef NoAttr<int> TDWithAttr __attribute__((deprecated));
-
-// expected-warning@+1 {{'UsingWithAttr' is deprecated}}
-typedef UsingWithAttr<int> TDUsingWithAttr;
-
-typedef NoAttr<int> TDNoAttr;
-
-// expected-note@+1 {{'UsingTDWithAttr' has been explicitly marked deprecated here}}
-using UsingTDWithAttr __attribute__((deprecated)) = TDNoAttr;
-
-struct S {
-  NoAttr<float> f1;
-  // expected-warning@+1 {{'UsingWithAttr' is deprecated}}
-  UsingWithAttr<float> f2;
-};
-
-// expected-warning@+1 {{'UsingWithAttr' is deprecated}}
-void foo(NoAttr<short> s1, UsingWithAttr<short> s2) {
-}
-
-// expected-note@+2 {{'UsingWithCPPAttr' has been explicitly marked deprecated here}}
-template <typename T>
-using UsingWithCPPAttr [[deprecated]] = NoAttr<T>;
-
-// expected-note@+1 {{'UsingInstWithCPPAttr' has been explicitly marked deprecated here}}
-using UsingInstWithCPPAttr [[deprecated("Do not use this")]] = NoAttr<int>;
-
-void bar() {
-  NoAttr<int> obj; // Okay
-
-  // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
-  // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
-  UsingWithAttr<int> objUsingWA;
-
-  // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
-  // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
-  NoAttr<UsingWithAttr<int>> s;
-
-  // expected-note@+1 {{'DepInt' has been explicitly marked deprecated here}}
-  using DepInt [[deprecated]] = int;
-  // expected-warning@+3 {{'UsingWithAttr' is deprecated}}
-  // expected-warning@+2 {{'DepInt' is deprecated}}
-  // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
-  using X = UsingWithAttr<DepInt>;
-
-  // expected-warning@+2 {{'UsingWithAttr' is deprecated}}
-  // expected-note@+1 {{in instantiation of template type alias 'UsingWithAttr' requested here}}
-  UsingWithAttr<int>().foo();
-
-  // expected-warning@+1 {{'UsingInstWithAttr' is deprecated}}
-  UsingInstWithAttr objUIWA;
-
-  // expected-warning@+1 {{'TDWithAttr' is deprecated}}
-  TDWithAttr objTDWA;
-
-  // expected-warning@+1 {{'UsingTDWithAttr' is deprecated}}
-  UsingTDWithAttr objUTDWA;
-
-  // expected-warning@+2 {{'UsingWithCPPAttr' is deprecated}}
-  // expected-note@+1 {{in instantiation of template type alias 'UsingWithCPPAttr' requested here}}
-  UsingWithCPPAttr<int> objUsingWCPPA;
-
-  // expected-warning@+1 {{'UsingInstWithCPPAttr' is deprecated: Do not use this}}
-  UsingInstWithCPPAttr objUICPPWA;
-}
diff --git a/clang/test/SemaTemplate/class-template-decl.cpp b/clang/test/SemaTemplate/class-template-decl.cpp
index 0374c4eba982a..c054a6a8d82f7 100644
--- a/clang/test/SemaTemplate/class-template-decl.cpp
+++ b/clang/test/SemaTemplate/class-template-decl.cpp
@@ -63,7 +63,7 @@ class X {
 };
 
 void f() {
-  template<typename T> class X; // expected-error{{templates can only be declared in namespace or class scope}}
+  template<typename T> class X; // expected-error{{expression}}
 }
 
 template<typename T> class X1 var; // expected-error {{variable has incomplete type 'class X1'}} \
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index 043a4ccf56e36..7b9ca4d83b020 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -68,7 +68,7 @@ using BT = B<char, 'x'>;
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 T
 // CHECK: |-NonTypeTemplateParmDecl {{.*}} 'T' depth 0 index 1 V
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U
-// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'U' depth 0 index 3 W
+// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'type-parameter-0-2' depth 0 index 3 W
 // CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (X<W, V>) -> B<T, V>'
 // CHECK: | `-ParmVarDecl {{.*}} 'X<W, V>'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (X<nullptr, 'x'>) -> B<char, 'x'>'
@@ -81,7 +81,7 @@ using BT = B<char, 'x'>;
 // CHECK: |-InjectedClassNameType {{.*}} 'B<T, V>' dependent
 // CHECK: `-TemplateSpecializationType {{.*}} 'X<W, V>' dependent
 // CHECK:   |-TemplateArgument expr
-// CHECK:   | `-DeclRefExpr {{.*}} 'U' NonTypeTemplateParm {{.*}} 'W' 'U'
+// CHECK:   | `-DeclRefExpr {{.*}} 'type-parameter-0-2' NonTypeTemplateParm {{.*}} 'W' 'type-parameter-0-2'
 // CHECK:   `-TemplateArgument expr
 // CHECK:     `-DeclRefExpr {{.*}} 'T' NonTypeTemplateParm {{.*}} 'V' 'T'
 
@@ -99,13 +99,13 @@ using CT = C<int>;
 // CHECK: | |-TemplateTypeParmDecl {{.*}} typename depth 1 index 0 X
 // CHECK: | `-NonTypeTemplateParmDecl {{.*}} 'X' depth 1 index 1
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U
-// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'U' depth 0 index 3 V
+// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'type-parameter-0-2' depth 0 index 3 V
 // CHECK: | `-TemplateArgument {{.*}} expr
 // CHECK: |   `-IntegerLiteral {{.*}} 'int' 0
-// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<T>, U) -> C<A>'
+// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<template-parameter-0-1>, type-parameter-0-2) -> C<A>'
 // CHECK: | |-ParmVarDecl {{.*}} 'A'
-// CHECK: | |-ParmVarDecl {{.*}} 'Y<T>'
-// CHECK: | `-ParmVarDecl {{.*}} 'U'
+// CHECK: | |-ParmVarDecl {{.*}} 'Y<template-parameter-0-1>'
+// CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-2'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (int, Y<B>, int) -> C<int>'
 // CHECK:  |-TemplateArgument type 'int'
 // CHECK:  |-TemplateArgument template 'B'
@@ -114,20 +114,20 @@ using CT = C<int>;
 // CHECK:  |-ParmVarDecl {{.*}} 'int'
 // CHECK:  |-ParmVarDecl {{.*}} 'Y<B>'
 // CHECK:  `-ParmVarDecl {{.*}} 'int'
-// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<T>, U) -> C<A>' dependent trailing_return cdecl
+// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<template-parameter-0-1>, type-parameter-0-2) -> C<A>' dependent trailing_return cdecl
 // CHECK: |-InjectedClassNameType {{.*}} 'C<A>' dependent
 // CHECK: |-TemplateTypeParmType {{.*}} 'A' dependent depth 0 index 0
 // CHECK: | `-TemplateTypeParm {{.*}} 'A'
-// CHECK: |-ElaboratedType {{.*}} 'Y<T>' sugar dependent
-// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<T>' dependent
+// CHECK: |-ElaboratedType {{.*}} 'Y<template-parameter-0-1>' sugar dependent
+// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<template-parameter-0-1>' dependent
 // CHECK: |   `-TemplateArgument template
-// CHECK: `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2
+// CHECK: `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2
 
 template<typename ...T> struct D { // expected-note {{candidate}} \
                                    // expected-note {{implicit deduction guide declared as 'template <typename ...T> D(D<T...>) -> D<T...>'}}
   template<typename... U> using B = int(int (*...p)(T, U));
   template<typename U1, typename U2> D(B<U1, U2>*); // expected-note {{candidate}} \
-                                                    // expected-note {{implicit deduction guide declared as 'template <typename ...T, typename U1, typename U2> D(B<U1, U2> *) -> D<T...>'}}
+                                                    // expected-note {{implicit deduction guide declared as 'template <typename ...T, typename U1, typename U2> D(B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>'}}
 };
 int f(int(int, int), int(int, int));
 // FIXME: We can't deduce this because we can't deduce through a
@@ -141,14 +141,14 @@ using DT = D<int, int>;
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 ... T
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 1 U1
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U2
-// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (B<U1, U2> *) -> D<T...>'
-// CHECK:   `-ParmVarDecl {{.*}} 'B<U1, U2> *'
-// CHECK: FunctionProtoType {{.*}} 'auto (B<U1, U2> *) -> D<T...>' dependent trailing_return
+// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>'
+// CHECK:   `-ParmVarDecl {{.*}} 'B<type-parameter-0-1, type-parameter-0-2> *'
+// CHECK: FunctionProtoType {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>' dependent trailing_return
 // CHECK: |-InjectedClassNameType {{.*}} 'D<T...>' dependent
-// CHECK: `-PointerType {{.*}} 'B<U1, U2> *' dependent
-// CHECK:   `-TemplateSpecializationType {{.*}} 'B<U1, U2>' sugar dependent alias
-// CHECK:     |-TemplateArgument type 'U1'
-// CHECK:     |-TemplateArgument type 'U2'
+// CHECK: `-PointerType {{.*}} 'B<type-parameter-0-1, type-parameter-0-2> *' dependent
+// CHECK:   `-TemplateSpecializationType {{.*}} 'B<type-parameter-0-1, type-parameter-0-2>' sugar dependent alias
+// CHECK:     |-TemplateArgument type 'type-parameter-0-1'
+// CHECK:     |-TemplateArgument type 'type-parameter-0-2'
 // CHECK:     `-FunctionProtoType {{.*}} 'int (int (*)(T, U)...)' dependent cdecl
 // CHECK:       |-BuiltinType {{.*}} 'int'
 // CHECK:       `-PackExpansionType {{.*}} 'int (*)(T, U)...' dependent expansions 2
@@ -232,17 +232,17 @@ F s(0);
 // CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 1 U
 // CHECK: |-ParenExpr {{.*}} 'bool'
 // CHECK: | `-CXXBoolLiteralExpr {{.*}} 'bool' false
-// CHECK: |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (U) -> F<>'
-// CHECK: | `-ParmVarDecl {{.*}} 'U'
+// CHECK: |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (type-parameter-0-1) -> F<>'
+// CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-1'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (int) -> F<>'
 // CHECK:   |-TemplateArgument integral ''x''
 // CHECK:   |-TemplateArgument type 'int'
 // CHECK:   | `-BuiltinType {{.*}} 'int'
 // CHECK:   `-ParmVarDecl {{.*}} 'int'
-// CHECK: FunctionProtoType {{.*}} 'auto (U) -> F<>' dependent trailing_return cdecl
+// CHECK: FunctionProtoType {{.*}} 'auto (type-parameter-0-1) -> F<>' dependent trailing_return cdecl
 // CHECK: |-InjectedClassNameType {{.*}} 'F<>' dependent
 // CHECK: | `-CXXRecord {{.*}} 'F'
-// CHECK: `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 1
+// CHECK: `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent depth 0 index 1
 
 template<typename T>
 struct G { T t; };
@@ -259,7 +259,7 @@ AG ag = {1};
 // Verify that the aggregate deduction guide for alias templates is built.
 // CHECK-LABEL: Dumping <deduction guide for AG>
 // CHECK: FunctionTemplateDecl
-// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (X) -> G<X>'
+// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (type-parameter-0-0) -> G<type-parameter-0-0>'
 // CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (int) -> G<int>' implicit_instantiation
 // CHECK:   |-TemplateArgument type 'int'
 // CHECK:   | `-BuiltinType {{.*}} 'int'
@@ -281,7 +281,7 @@ struct Foo {
 template <typename U>
 using AFoo = Foo<G<U>>;
 // Verify that the require-clause from the Foo deduction guide is transformed.
-// The D occurrence should be rewritten to G<U>.
+// The D occurrence should be rewritten to G<type-parameter-0-0>.
 //
 // CHECK-LABEL: Dumping <deduction guide for AFoo>
 // CHECK: FunctionTemplateDecl {{.*}} implicit <deduction guide for AFoo>
@@ -289,16 +289,16 @@ using AFoo = Foo<G<U>>;
 // CHECK-NEXT: |-BinaryOperator {{.*}} '&&'
 // CHECK-NEXT: | |-ParenExpr {{.*}} 'bool'
 // CHECK-NEXT: | | `-BinaryOperator {{.*}} 'bool' '=='
-// CHECK-NEXT: | |   |-UnaryExprOrTypeTraitExpr {{.*}} 'G<U>'
+// CHECK-NEXT: | |   |-UnaryExprOrTypeTraitExpr {{.*}} 'G<type-parameter-0-0>'
 // CHECK-NEXT: | |   `-ImplicitCastExpr {{.*}}
 // CHECK-NEXT: | |     `-IntegerLiteral {{.*}}
 // CHECK-NEXT: | `-TypeTraitExpr {{.*}} 'bool' __is_deducible
 // CHECK-NEXT: |   |-DeducedTemplateSpecializationType {{.*}} 'AFoo' dependent
 // CHECK-NEXT: |   | `-name: 'AFoo'
 // CHECK-NEXT: |   |   `-TypeAliasTemplateDecl {{.+}} AFoo
-// CHECK-NEXT: |   `-TemplateSpecializationType {{.*}} 'Foo<G<U>>' dependent
-// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for AFoo> 'auto (G<U>) -> Foo<G<U>>'
-// CHECK-NEXT: | `-ParmVarDecl {{.*}} 'G<U>'
+// CHECK-NEXT: |   `-TemplateSpecializationType {{.*}} 'Foo<G<type-parameter-0-0>>' dependent
+// CHECK:      |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for AFoo> 'auto (G<type-parameter-0-0>) -> Foo<G<type-parameter-0-0>>'
+// CHECK-NEXT: | `-ParmVarDecl {{.*}} 'G<type-parameter-0-0>'
 // CHECK-NEXT: `-CXXDeductionGuideDecl {{.*}} implicit used <deduction guide for AFoo> 'auto (G<int>) -> Foo<G<int>>' implicit_instantiation
 // CHECK-NEXT:   |-TemplateArgument type 'int'
 // CHECK-NEXT:   | `-BuiltinType {{.*}} 'int'
@@ -321,20 +321,20 @@ namespace TTP {
 // CHECK-NEXT:  |-TemplateTypeParmDecl {{.+}} class depth 0 index 0 T{{$}}
 // CHECK-NEXT:  |-TemplateTemplateParmDecl {{.+}} depth 0 index 1 TT{{$}}
 // CHECK-NEXT:  | `-TemplateTypeParmDecl {{.+}} class depth 1 index 0{{$}}
-// CHECK-NEXT:  |-CXXDeductionGuideDecl {{.+}} 'auto (TT<T>) -> B<T>'{{$}}
-// CHECK-NEXT:  | `-ParmVarDecl {{.+}} 'TT<T>'{{$}}
+// CHECK-NEXT:  |-CXXDeductionGuideDecl {{.+}} 'auto (template-parameter-0-1<T>) -> B<T>'{{$}}
+// CHECK-NEXT:  | `-ParmVarDecl {{.+}} 'template-parameter-0-1<T>'{{$}}
 // CHECK-NEXT:  `-CXXDeductionGuideDecl {{.+}} 'auto (A<int>) -> TTP::B<int>'
 // CHECK-NEXT:    |-TemplateArgument type 'int'
 // CHECK-NEXT:    | `-BuiltinType {{.+}} 'int'{{$}}
 // CHECK-NEXT:    |-TemplateArgument template 'TTP::A'{{$}}
 // CHECK-NEXT:    | `-ClassTemplateDecl {{.+}} A{{$}}
 // CHECK-NEXT:    `-ParmVarDecl {{.+}} 'A<int>':'TTP::A<int>'{{$}}
-// CHECK-NEXT:  FunctionProtoType {{.+}} 'auto (TT<T>) -> B<T>' dependent trailing_return cdecl{{$}}
+// CHECK-NEXT:  FunctionProtoType {{.+}} 'auto (template-parameter-0-1<T>) -> B<T>' dependent trailing_return cdecl{{$}}
 // CHECK-NEXT:  |-InjectedClassNameType {{.+}} 'B<T>' dependent{{$}}
 // CHECK-NEXT:  | `-CXXRecord {{.+}} 'B'{{$}}
-// CHECK-NEXT:  `-ElaboratedType {{.+}} 'TT<T>' sugar dependent{{$}}
-// CHECK-NEXT:    `-TemplateSpecializationType {{.+}} 'TT<T>' dependent{{$}}
-// CHECK-NEXT:      |-name: 'TT':'template-parameter-0-1' qualified
+// CHECK-NEXT:  `-ElaboratedType {{.+}} 'template-parameter-0-1<T>' sugar dependent{{$}}
+// CHECK-NEXT:    `-TemplateSpecializationType {{.+}} 'template-parameter-0-1<T>' dependent{{$}}
+// CHECK-NEXT:      |-name: 'template-parameter-0-1' qualified
 // CHECK-NEXT:      | `-TemplateTemplateParmDecl {{.+}} depth 0 index 1
 // CHECK-NEXT:      `-TemplateArgument type 'T':'type-parameter-0-0'{{$}}
 // CHECK-NEXT:        `-TemplateTypeParmType {{.+}} 'T' dependent depth 0 index 0{{$}}
diff --git a/clang/test/SemaTemplate/nested-template.cpp b/clang/test/SemaTemplate/nested-template.cpp
index a54992a397187..a6ede8e99037e 100644
--- a/clang/test/SemaTemplate/nested-template.cpp
+++ b/clang/test/SemaTemplate/nested-template.cpp
@@ -132,7 +132,7 @@ namespace PR10896 {
   private:
 
     template<typename T>
-    T SomeField; // expected-error {{non-static data member 'SomeField' cannot be declared as a template}}
+    T SomeField; // expected-error {{member 'SomeField' declared as a template}}
     template<> int SomeField2; // expected-error {{extraneous 'template<>' in declaration of variable 'SomeField2'}}
   };
 
diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py
index 2bd7501136a10..2e0fbc2c9e1dd 100644
--- a/clang/test/lit.cfg.py
+++ b/clang/test/lit.cfg.py
@@ -95,7 +95,6 @@
     "llvm-ifs",
     "yaml2obj",
     "clang-linker-wrapper",
-    "clang-nvlink-wrapper",
     "llvm-lto",
     "llvm-lto2",
     "llvm-profdata",
diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt
index 4885afc1584d0..bdd8004be3e02 100644
--- a/clang/tools/CMakeLists.txt
+++ b/clang/tools/CMakeLists.txt
@@ -9,7 +9,6 @@ add_clang_subdirectory(clang-format-vs)
 add_clang_subdirectory(clang-fuzzer)
 add_clang_subdirectory(clang-import-test)
 add_clang_subdirectory(clang-linker-wrapper)
-add_clang_subdirectory(clang-nvlink-wrapper)
 add_clang_subdirectory(clang-offload-packager)
 add_clang_subdirectory(clang-offload-bundler)
 add_clang_subdirectory(clang-scan-deps)
diff --git a/clang/tools/clang-fuzzer/dictionary/CMakeLists.txt b/clang/tools/clang-fuzzer/dictionary/CMakeLists.txt
index 6b72b98f5e1c4..ee4aa587ea54d 100644
--- a/clang/tools/clang-fuzzer/dictionary/CMakeLists.txt
+++ b/clang/tools/clang-fuzzer/dictionary/CMakeLists.txt
@@ -1,4 +1,3 @@
-set(CMAKE_CXX_FLAGS ${CXX_FLAGS_NOFUZZ})
 add_clang_executable(clang-fuzzer-dictionary
   dictionary.c
   )
diff --git a/clang/tools/clang-nvlink-wrapper/CMakeLists.txt b/clang/tools/clang-nvlink-wrapper/CMakeLists.txt
deleted file mode 100644
index 846fa952ba58d..0000000000000
--- a/clang/tools/clang-nvlink-wrapper/CMakeLists.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  ${LLVM_TARGETS_TO_BUILD}
-  BitWriter
-  Core
-  BinaryFormat
-  MC
-  Target
-  TransformUtils
-  Analysis
-  Passes
-  IRReader
-  Object
-  Option
-  Support
-  TargetParser
-  CodeGen
-  LTO
-  )
-
-set(LLVM_TARGET_DEFINITIONS NVLinkOpts.td)
-tablegen(LLVM NVLinkOpts.inc -gen-opt-parser-defs)
-add_public_tablegen_target(NVLinkWrapperOpts)
-
-if(NOT CLANG_BUILT_STANDALONE)
-  set(tablegen_deps intrinsics_gen NVLinkWrapperOpts)
-endif()
-
-add_clang_tool(clang-nvlink-wrapper
-  ClangNVLinkWrapper.cpp
-
-  DEPENDS
-  ${tablegen_deps}
-  )
-
-set(CLANG_NVLINK_WRAPPER_LIB_DEPS
-  clangBasic
-  )
-
-target_link_libraries(clang-nvlink-wrapper
-  PRIVATE
-  ${CLANG_NVLINK_WRAPPER_LIB_DEPS}
-  )
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
deleted file mode 100644
index 3885166e76ca7..0000000000000
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ /dev/null
@@ -1,785 +0,0 @@
-//===-- clang-nvlink-wrapper/ClangNVLinkWrapper.cpp - NVIDIA linker util --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===---------------------------------------------------------------------===//
-//
-// This tool wraps around the NVIDIA linker called 'nvlink'. The NVIDIA linker
-// is required to create NVPTX applications, but does not support common
-// features like LTO or archives. This utility wraps around the tool to cover
-// its deficiencies. This tool can be removed once NVIDIA improves their linker
-// or ports it to `ld.lld`.
-//
-//===---------------------------------------------------------------------===//
-
-#include "clang/Basic/Version.h"
-
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/BinaryFormat/Magic.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/CodeGen/CommandFlags.h"
-#include "llvm/IR/DiagnosticPrinter.h"
-#include "llvm/LTO/LTO.h"
-#include "llvm/Object/Archive.h"
-#include "llvm/Object/ArchiveWriter.h"
-#include "llvm/Object/Binary.h"
-#include "llvm/Object/ELFObjectFile.h"
-#include "llvm/Object/IRObjectFile.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Object/OffloadBinary.h"
-#include "llvm/Option/ArgList.h"
-#include "llvm/Option/OptTable.h"
-#include "llvm/Option/Option.h"
-#include "llvm/Remarks/HotnessThresholdParser.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileOutputBuffer.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/Signals.h"
-#include "llvm/Support/StringSaver.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/WithColor.h"
-
-using namespace llvm;
-using namespace llvm::opt;
-using namespace llvm::object;
-
-// Various tools (e.g., llc and opt) duplicate this series of declarations for
-// options related to passes and remarks.
-static cl::opt<bool> RemarksWithHotness(
-    "pass-remarks-with-hotness",
-    cl::desc("With PGO, include profile count in optimization remarks"),
-    cl::Hidden);
-
-static cl::opt<std::optional<uint64_t>, false, remarks::HotnessThresholdParser>
-    RemarksHotnessThreshold(
-        "pass-remarks-hotness-threshold",
-        cl::desc("Minimum profile count required for "
-                 "an optimization remark to be output. "
-                 "Use 'auto' to apply the threshold from profile summary."),
-        cl::value_desc("N or 'auto'"), cl::init(0), cl::Hidden);
-
-static cl::opt<std::string>
-    RemarksFilename("pass-remarks-output",
-                    cl::desc("Output filename for pass remarks"),
-                    cl::value_desc("filename"));
-
-static cl::opt<std::string>
-    RemarksPasses("pass-remarks-filter",
-                  cl::desc("Only record optimization remarks from passes whose "
-                           "names match the given regular expression"),
-                  cl::value_desc("regex"));
-
-static cl::opt<std::string> RemarksFormat(
-    "pass-remarks-format",
-    cl::desc("The format used for serializing remarks (default: YAML)"),
-    cl::value_desc("format"), cl::init("yaml"));
-
-static cl::list<std::string>
-    PassPlugins("load-pass-plugin",
-                cl::desc("Load passes from plugin library"));
-
-static cl::opt<std::string> PassPipeline(
-    "passes",
-    cl::desc(
-        "A textual description of the pass pipeline. To have analysis passes "
-        "available before a certain pass, add 'require<foo-analysis>'. "
-        "'-passes' overrides the pass pipeline (but not all effects) from "
-        "specifying '--opt-level=O?' (O2 is the default) to "
-        "clang-linker-wrapper.  Be sure to include the corresponding "
-        "'default<O?>' in '-passes'."));
-static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline),
-                               cl::desc("Alias for -passes"));
-
-static void printVersion(raw_ostream &OS) {
-  OS << clang::getClangToolFullVersion("clang-nvlink-wrapper") << '\n';
-}
-
-/// The value of `argv[0]` when run.
-static const char *Executable;
-
-/// Temporary files to be cleaned up.
-static SmallVector<SmallString<128>> TempFiles;
-
-/// Codegen flags for LTO backend.
-static codegen::RegisterCodeGenFlags CodeGenFlags;
-
-namespace {
-// Must not overlap with llvm::opt::DriverFlag.
-enum WrapperFlags { WrapperOnlyOption = (1 << 4) };
-
-enum ID {
-  OPT_INVALID = 0, // This is not an option ID.
-#define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
-#include "NVLinkOpts.inc"
-  LastOption
-#undef OPTION
-};
-
-#define PREFIX(NAME, VALUE)                                                    \
-  static constexpr StringLiteral NAME##_init[] = VALUE;                        \
-  static constexpr ArrayRef<StringLiteral> NAME(NAME##_init,                   \
-                                                std::size(NAME##_init) - 1);
-#include "NVLinkOpts.inc"
-#undef PREFIX
-
-static constexpr OptTable::Info InfoTable[] = {
-#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
-#include "NVLinkOpts.inc"
-#undef OPTION
-};
-
-class WrapperOptTable : public opt::GenericOptTable {
-public:
-  WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
-};
-
-const OptTable &getOptTable() {
-  static const WrapperOptTable *Table = []() {
-    auto Result = std::make_unique<WrapperOptTable>();
-    return Result.release();
-  }();
-  return *Table;
-}
-
-[[noreturn]] void reportError(Error E) {
-  outs().flush();
-  logAllUnhandledErrors(std::move(E), WithColor::error(errs(), Executable));
-  exit(EXIT_FAILURE);
-}
-
-void diagnosticHandler(const DiagnosticInfo &DI) {
-  std::string ErrStorage;
-  raw_string_ostream OS(ErrStorage);
-  DiagnosticPrinterRawOStream DP(OS);
-  DI.print(DP);
-
-  switch (DI.getSeverity()) {
-  case DS_Error:
-    WithColor::error(errs(), Executable) << ErrStorage << "\n";
-    break;
-  case DS_Warning:
-    WithColor::warning(errs(), Executable) << ErrStorage << "\n";
-    break;
-  case DS_Note:
-    WithColor::note(errs(), Executable) << ErrStorage << "\n";
-    break;
-  case DS_Remark:
-    WithColor::remark(errs()) << ErrStorage << "\n";
-    break;
-  }
-}
-
-Expected<StringRef> createTempFile(const ArgList &Args, const Twine &Prefix,
-                                   StringRef Extension) {
-  SmallString<128> OutputFile;
-  if (Args.hasArg(OPT_save_temps)) {
-    (Prefix + "." + Extension).toNullTerminatedStringRef(OutputFile);
-  } else {
-    if (std::error_code EC =
-            sys::fs::createTemporaryFile(Prefix, Extension, OutputFile))
-      return createFileError(OutputFile, EC);
-  }
-
-  TempFiles.emplace_back(std::move(OutputFile));
-  return TempFiles.back();
-}
-
-Expected<std::string> findProgram(const ArgList &Args, StringRef Name,
-                                  ArrayRef<StringRef> Paths) {
-  if (Args.hasArg(OPT_dry_run))
-    return Name.str();
-  ErrorOr<std::string> Path = sys::findProgramByName(Name, Paths);
-  if (!Path)
-    Path = sys::findProgramByName(Name);
-  if (!Path)
-    return createStringError(Path.getError(),
-                             "Unable to find '" + Name + "' in path");
-  return *Path;
-}
-
-std::optional<std::string> findFile(StringRef Dir, StringRef Root,
-                                    const Twine &Name) {
-  SmallString<128> Path;
-  if (Dir.starts_with("="))
-    sys::path::append(Path, Root, Dir.substr(1), Name);
-  else
-    sys::path::append(Path, Dir, Name);
-
-  if (sys::fs::exists(Path))
-    return static_cast<std::string>(Path);
-  return std::nullopt;
-}
-
-std::optional<std::string>
-findFromSearchPaths(StringRef Name, StringRef Root,
-                    ArrayRef<StringRef> SearchPaths) {
-  for (StringRef Dir : SearchPaths)
-    if (std::optional<std::string> File = findFile(Dir, Root, Name))
-      return File;
-  return std::nullopt;
-}
-
-std::optional<std::string>
-searchLibraryBaseName(StringRef Name, StringRef Root,
-                      ArrayRef<StringRef> SearchPaths) {
-  for (StringRef Dir : SearchPaths)
-    if (std::optional<std::string> File =
-            findFile(Dir, Root, "lib" + Name + ".a"))
-      return File;
-  return std::nullopt;
-}
-
-/// Search for static libraries in the linker's library path given input like
-/// `-lfoo` or `-l:libfoo.a`.
-std::optional<std::string> searchLibrary(StringRef Input, StringRef Root,
-                                         ArrayRef<StringRef> SearchPaths) {
-  if (Input.starts_with(":"))
-    return findFromSearchPaths(Input.drop_front(), Root, SearchPaths);
-  return searchLibraryBaseName(Input, Root, SearchPaths);
-}
-
-void printCommands(ArrayRef<StringRef> CmdArgs) {
-  if (CmdArgs.empty())
-    return;
-
-  llvm::errs() << " \"" << CmdArgs.front() << "\" ";
-  llvm::errs() << llvm::join(std::next(CmdArgs.begin()), CmdArgs.end(), " ")
-               << "\n";
-}
-
-/// A minimum symbol interface that provides the necessary information to
-/// extract archive members and resolve LTO symbols.
-struct Symbol {
-  enum Flags {
-    None = 0,
-    Undefined = 1 << 0,
-    Weak = 1 << 1,
-  };
-
-  Symbol() : File(), Flags(None), UsedInRegularObj(false) {}
-
-  Symbol(MemoryBufferRef File, const irsymtab::Reader::SymbolRef Sym)
-      : File(File), Flags(0), UsedInRegularObj(false) {
-    if (Sym.isUndefined())
-      Flags |= Undefined;
-    if (Sym.isWeak())
-      Flags |= Weak;
-  }
-
-  Symbol(MemoryBufferRef File, const SymbolRef Sym)
-      : File(File), Flags(0), UsedInRegularObj(false) {
-    auto FlagsOrErr = Sym.getFlags();
-    if (!FlagsOrErr)
-      reportError(FlagsOrErr.takeError());
-    if (*FlagsOrErr & SymbolRef::SF_Undefined)
-      Flags |= Undefined;
-    if (*FlagsOrErr & SymbolRef::SF_Weak)
-      Flags |= Weak;
-
-    auto NameOrErr = Sym.getName();
-    if (!NameOrErr)
-      reportError(NameOrErr.takeError());
-  }
-
-  bool isWeak() const { return Flags & Weak; }
-  bool isUndefined() const { return Flags & Undefined; }
-
-  MemoryBufferRef File;
-  uint32_t Flags;
-  bool UsedInRegularObj;
-};
-
-Expected<StringRef> runPTXAs(StringRef File, const ArgList &Args) {
-  std::string CudaPath = Args.getLastArgValue(OPT_cuda_path_EQ).str();
-  std::string GivenPath = Args.getLastArgValue(OPT_ptxas_path_EQ).str();
-  Expected<std::string> PTXAsPath =
-      findProgram(Args, "ptxas", {CudaPath + "/bin", GivenPath});
-  if (!PTXAsPath)
-    return PTXAsPath.takeError();
-
-  auto TempFileOrErr = createTempFile(
-      Args, sys::path::stem(Args.getLastArgValue(OPT_o, "a.out")), "cubin");
-  if (!TempFileOrErr)
-    return TempFileOrErr.takeError();
-
-  SmallVector<StringRef> AssemblerArgs({*PTXAsPath, "-m64", "-c", File});
-  if (Args.hasArg(OPT_verbose))
-    AssemblerArgs.push_back("-v");
-  if (Args.hasArg(OPT_g)) {
-    if (Args.hasArg(OPT_O))
-      WithColor::warning(errs(), Executable)
-          << "Optimized debugging not supported, overriding to '-O0'\n";
-    AssemblerArgs.push_back("-O0");
-  } else
-    AssemblerArgs.push_back(
-        Args.MakeArgString("-O" + Args.getLastArgValue(OPT_O, "3")));
-  AssemblerArgs.append({"-arch", Args.getLastArgValue(OPT_arch)});
-  AssemblerArgs.append({"-o", *TempFileOrErr});
-
-  if (Args.hasArg(OPT_dry_run) || Args.hasArg(OPT_verbose))
-    printCommands(AssemblerArgs);
-  if (Args.hasArg(OPT_dry_run))
-    return Args.MakeArgString(*TempFileOrErr);
-  if (sys::ExecuteAndWait(*PTXAsPath, AssemblerArgs))
-    return createStringError("'" + sys::path::filename(*PTXAsPath) + "'" +
-                             " failed");
-  return Args.MakeArgString(*TempFileOrErr);
-}
-
-Expected<std::unique_ptr<lto::LTO>> createLTO(const ArgList &Args) {
-  const llvm::Triple Triple("nvptx64-nvidia-cuda");
-  lto::Config Conf;
-  lto::ThinBackend Backend;
-  unsigned Jobs = 0;
-  if (auto *Arg = Args.getLastArg(OPT_jobs))
-    if (!llvm::to_integer(Arg->getValue(), Jobs) || Jobs == 0)
-      reportError(createStringError("%s: expected a positive integer, got '%s'",
-                                    Arg->getSpelling().data(),
-                                    Arg->getValue()));
-  Backend = lto::createInProcessThinBackend(
-      llvm::heavyweight_hardware_concurrency(Jobs));
-
-  Conf.CPU = Args.getLastArgValue(OPT_arch);
-  Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
-
-  Conf.RemarksFilename = RemarksFilename;
-  Conf.RemarksPasses = RemarksPasses;
-  Conf.RemarksWithHotness = RemarksWithHotness;
-  Conf.RemarksHotnessThreshold = RemarksHotnessThreshold;
-  Conf.RemarksFormat = RemarksFormat;
-
-  Conf.MAttrs = {Args.getLastArgValue(OPT_feature, "").str()};
-  std::optional<CodeGenOptLevel> CGOptLevelOrNone =
-      CodeGenOpt::parseLevel(Args.getLastArgValue(OPT_O, "2")[0]);
-  assert(CGOptLevelOrNone && "Invalid optimization level");
-  Conf.CGOptLevel = *CGOptLevelOrNone;
-  Conf.OptLevel = Args.getLastArgValue(OPT_O, "2")[0] - '0';
-  Conf.DefaultTriple = Triple.getTriple();
-
-  Conf.OptPipeline = PassPipeline;
-  Conf.PassPlugins = PassPlugins;
-
-  Conf.DiagHandler = diagnosticHandler;
-  Conf.CGFileType = CodeGenFileType::AssemblyFile;
-
-  if (Args.hasArg(OPT_lto_emit_llvm)) {
-    Conf.PreCodeGenModuleHook = [&](size_t, const Module &M) {
-      std::error_code EC;
-      raw_fd_ostream LinkedBitcode(Args.getLastArgValue(OPT_o, "a.out"), EC);
-      if (EC)
-        reportError(errorCodeToError(EC));
-      WriteBitcodeToFile(M, LinkedBitcode);
-      return false;
-    };
-  }
-
-  if (Args.hasArg(OPT_save_temps))
-    if (Error Err = Conf.addSaveTemps(
-            (Args.getLastArgValue(OPT_o, "a.out") + ".").str()))
-      return Err;
-
-  unsigned Partitions = 1;
-  if (auto *Arg = Args.getLastArg(OPT_lto_partitions))
-    if (!llvm::to_integer(Arg->getValue(), Partitions) || Partitions == 0)
-      reportError(createStringError("%s: expected a positive integer, got '%s'",
-                                    Arg->getSpelling().data(),
-                                    Arg->getValue()));
-  lto::LTO::LTOKind Kind = Args.hasArg(OPT_thinlto) ? lto::LTO::LTOK_UnifiedThin
-                                                    : lto::LTO::LTOK_Default;
-  return std::make_unique<lto::LTO>(std::move(Conf), Backend, Partitions, Kind);
-}
-
-Expected<bool> getSymbolsFromBitcode(MemoryBufferRef Buffer,
-                                     StringMap<Symbol> &SymTab, bool IsLazy) {
-  Expected<IRSymtabFile> IRSymtabOrErr = readIRSymtab(Buffer);
-  if (!IRSymtabOrErr)
-    return IRSymtabOrErr.takeError();
-  bool Extracted = !IsLazy;
-  StringMap<Symbol> PendingSymbols;
-  for (unsigned I = 0; I != IRSymtabOrErr->Mods.size(); ++I) {
-    for (const auto &IRSym : IRSymtabOrErr->TheReader.module_symbols(I)) {
-      if (IRSym.isFormatSpecific() || !IRSym.isGlobal())
-        continue;
-
-      Symbol &OldSym = !SymTab.count(IRSym.getName()) && IsLazy
-                           ? PendingSymbols[IRSym.getName()]
-                           : SymTab[IRSym.getName()];
-      Symbol Sym = Symbol(Buffer, IRSym);
-      if (OldSym.File.getBuffer().empty())
-        OldSym = Sym;
-
-      bool ResolvesReference =
-          !Sym.isUndefined() &&
-          (OldSym.isUndefined() || (OldSym.isWeak() && !Sym.isWeak())) &&
-          !(OldSym.isWeak() && OldSym.isUndefined() && IsLazy);
-      Extracted |= ResolvesReference;
-
-      Sym.UsedInRegularObj = OldSym.UsedInRegularObj;
-      if (ResolvesReference)
-        OldSym = Sym;
-    }
-  }
-  if (Extracted)
-    for (const auto &[Name, Symbol] : PendingSymbols)
-      SymTab[Name] = Symbol;
-  return Extracted;
-}
-
-Expected<bool> getSymbolsFromObject(ObjectFile &ObjFile,
-                                    StringMap<Symbol> &SymTab, bool IsLazy) {
-  bool Extracted = !IsLazy;
-  StringMap<Symbol> PendingSymbols;
-  for (SymbolRef ObjSym : ObjFile.symbols()) {
-    auto NameOrErr = ObjSym.getName();
-    if (!NameOrErr)
-      return NameOrErr.takeError();
-
-    Symbol &OldSym = !SymTab.count(*NameOrErr) && IsLazy
-                         ? PendingSymbols[*NameOrErr]
-                         : SymTab[*NameOrErr];
-    Symbol Sym = Symbol(ObjFile.getMemoryBufferRef(), ObjSym);
-    if (OldSym.File.getBuffer().empty())
-      OldSym = Sym;
-
-    bool ResolvesReference = OldSym.isUndefined() && !Sym.isUndefined() &&
-                             (!OldSym.isWeak() || !IsLazy);
-    Extracted |= ResolvesReference;
-
-    if (ResolvesReference)
-      OldSym = Sym;
-    OldSym.UsedInRegularObj = true;
-  }
-  if (Extracted)
-    for (const auto &[Name, Symbol] : PendingSymbols)
-      SymTab[Name] = Symbol;
-  return Extracted;
-}
-
-Expected<bool> getSymbols(MemoryBufferRef Buffer, StringMap<Symbol> &SymTab,
-                          bool IsLazy) {
-  switch (identify_magic(Buffer.getBuffer())) {
-  case file_magic::bitcode: {
-    return getSymbolsFromBitcode(Buffer, SymTab, IsLazy);
-  }
-  case file_magic::elf_relocatable: {
-    Expected<std::unique_ptr<ObjectFile>> ObjFile =
-        ObjectFile::createObjectFile(Buffer);
-    if (!ObjFile)
-      return ObjFile.takeError();
-    return getSymbolsFromObject(**ObjFile, SymTab, IsLazy);
-  }
-  default:
-    return createStringError("Unsupported file type");
-  }
-}
-
-Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
-  SmallVector<StringRef> LibraryPaths;
-  for (const opt::Arg *Arg : Args.filtered(OPT_library_path))
-    LibraryPaths.push_back(Arg->getValue());
-
-  bool WholeArchive = false;
-  SmallVector<std::pair<std::unique_ptr<MemoryBuffer>, bool>> InputFiles;
-  for (const opt::Arg *Arg : Args.filtered(
-           OPT_INPUT, OPT_library, OPT_whole_archive, OPT_no_whole_archive)) {
-    if (Arg->getOption().matches(OPT_whole_archive) ||
-        Arg->getOption().matches(OPT_no_whole_archive)) {
-      WholeArchive = Arg->getOption().matches(OPT_whole_archive);
-      continue;
-    }
-
-    std::optional<std::string> Filename =
-        Arg->getOption().matches(OPT_library)
-            ? searchLibrary(Arg->getValue(), /*Root=*/"", LibraryPaths)
-            : std::string(Arg->getValue());
-
-    if (!Filename && Arg->getOption().matches(OPT_library))
-      return createStringError("unable to find library -l%s", Arg->getValue());
-
-    if (!Filename || !sys::fs::exists(*Filename) ||
-        sys::fs::is_directory(*Filename))
-      continue;
-
-    ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
-        MemoryBuffer::getFileOrSTDIN(*Filename);
-    if (std::error_code EC = BufferOrErr.getError())
-      return createFileError(*Filename, EC);
-
-    MemoryBufferRef Buffer = **BufferOrErr;
-    switch (identify_magic(Buffer.getBuffer())) {
-    case file_magic::bitcode:
-    case file_magic::elf_relocatable:
-      InputFiles.emplace_back(std::move(*BufferOrErr), /*IsLazy=*/false);
-      break;
-    case file_magic::archive: {
-      Expected<std::unique_ptr<llvm::object::Archive>> LibFile =
-          object::Archive::create(Buffer);
-      if (!LibFile)
-        return LibFile.takeError();
-      Error Err = Error::success();
-      for (auto Child : (*LibFile)->children(Err)) {
-        auto ChildBufferOrErr = Child.getMemoryBufferRef();
-        if (!ChildBufferOrErr)
-          return ChildBufferOrErr.takeError();
-        std::unique_ptr<MemoryBuffer> ChildBuffer =
-            MemoryBuffer::getMemBufferCopy(
-                ChildBufferOrErr->getBuffer(),
-                ChildBufferOrErr->getBufferIdentifier());
-        InputFiles.emplace_back(std::move(ChildBuffer), !WholeArchive);
-      }
-      if (Err)
-        return Err;
-      break;
-    }
-    default:
-      return createStringError("Unsupported file type");
-    }
-  }
-
-  bool Extracted = true;
-  StringMap<Symbol> SymTab;
-  SmallVector<std::unique_ptr<MemoryBuffer>> LinkerInput;
-  while (Extracted) {
-    Extracted = false;
-    for (auto &[Input, IsLazy] : InputFiles) {
-      if (!Input)
-        continue;
-
-      // Archive members are only extracted if they define needed symbols. We
-      // re-scan all the inputs if any files were extracted for the link job.
-      Expected<bool> ExtractOrErr = getSymbols(*Input, SymTab, IsLazy);
-      if (!ExtractOrErr)
-        return ExtractOrErr.takeError();
-
-      Extracted |= *ExtractOrErr;
-      if (!*ExtractOrErr)
-        continue;
-
-      LinkerInput.emplace_back(std::move(Input));
-    }
-  }
-  InputFiles.clear();
-
-  // Extract any bitcode files to be passed to the LTO pipeline.
-  SmallVector<std::unique_ptr<MemoryBuffer>> BitcodeFiles;
-  for (auto &Input : LinkerInput)
-    if (identify_magic(Input->getBuffer()) == file_magic::bitcode)
-      BitcodeFiles.emplace_back(std::move(Input));
-  llvm::erase_if(LinkerInput, [](const auto &F) { return !F; });
-
-  // Run the LTO pipeline on the extracted inputs.
-  SmallVector<StringRef> Files;
-  if (!BitcodeFiles.empty()) {
-    auto LTOBackendOrErr = createLTO(Args);
-    if (!LTOBackendOrErr)
-      return LTOBackendOrErr.takeError();
-    lto::LTO &LTOBackend = **LTOBackendOrErr;
-    for (auto &BitcodeFile : BitcodeFiles) {
-      Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
-          llvm::lto::InputFile::create(*BitcodeFile);
-      if (!BitcodeFileOrErr)
-        return BitcodeFileOrErr.takeError();
-
-      const auto Symbols = (*BitcodeFileOrErr)->symbols();
-      SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
-      size_t Idx = 0;
-      for (auto &Sym : Symbols) {
-        lto::SymbolResolution &Res = Resolutions[Idx++];
-        Symbol ObjSym = SymTab[Sym.getName()];
-        // We will use this as the prevailing symbol in LTO if it is not
-        // undefined and it is from the file that contained the canonical
-        // definition.
-        Res.Prevailing = !Sym.isUndefined() && ObjSym.File == *BitcodeFile;
-
-        // We need LTO to preserve the following global symbols:
-        // 1) Symbols used in regular objects.
-        // 2) Prevailing symbols that need to be visible to the GPU runtime.
-        Res.VisibleToRegularObj =
-            ObjSym.UsedInRegularObj ||
-            (Res.Prevailing &&
-             (Sym.getVisibility() != GlobalValue::HiddenVisibility &&
-              !Sym.canBeOmittedFromSymbolTable()));
-
-        // Identify symbols that must be exported dynamically and can be
-        // referenced by other files (i.e., the runtime).
-        Res.ExportDynamic =
-            Sym.getVisibility() != GlobalValue::HiddenVisibility &&
-            !Sym.canBeOmittedFromSymbolTable();
-
-        // The NVIDIA platform does not support any symbol preemption.
-        Res.FinalDefinitionInLinkageUnit = true;
-
-        // We do not support linker redefined symbols (e.g. --wrap) for device
-        // image linking, so the symbols will not be changed after LTO.
-        Res.LinkerRedefined = false;
-      }
-
-      // Add the bitcode file with its resolved symbols to the LTO job.
-      if (Error Err = LTOBackend.add(std::move(*BitcodeFileOrErr), Resolutions))
-        return Err;
-    }
-
-    // Run the LTO job to compile the bitcode.
-    size_t MaxTasks = LTOBackend.getMaxTasks();
-    SmallVector<StringRef> LTOFiles(MaxTasks);
-    auto AddStream =
-        [&](size_t Task,
-            const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
-      int FD = -1;
-      auto &TempFile = LTOFiles[Task];
-      if (Args.hasArg(OPT_lto_emit_asm))
-        TempFile = Args.getLastArgValue(OPT_o, "a.out");
-      else {
-        auto TempFileOrErr = createTempFile(
-            Args, sys::path::stem(Args.getLastArgValue(OPT_o, "a.out")), "s");
-        if (!TempFileOrErr)
-          reportError(TempFileOrErr.takeError());
-        TempFile = Args.MakeArgString(*TempFileOrErr);
-      }
-      if (std::error_code EC = sys::fs::openFileForWrite(TempFile, FD))
-        reportError(errorCodeToError(EC));
-      return std::make_unique<CachedFileStream>(
-          std::make_unique<llvm::raw_fd_ostream>(FD, true));
-    };
-
-    if (Error Err = LTOBackend.run(AddStream))
-      return Err;
-
-    if (Args.hasArg(OPT_lto_emit_llvm) || Args.hasArg(OPT_lto_emit_asm))
-      return Files;
-
-    for (StringRef LTOFile : LTOFiles) {
-      auto FileOrErr = runPTXAs(LTOFile, Args);
-      if (!FileOrErr)
-        return FileOrErr.takeError();
-      Files.emplace_back(*FileOrErr);
-    }
-  }
-
-  // Copy all of the input files to a new file ending in `.cubin`. The 'nvlink'
-  // linker requires all NVPTX inputs to have this extension for some reason.
-  for (auto &Input : LinkerInput) {
-    auto TempFileOrErr = createTempFile(
-        Args, sys::path::stem(Input->getBufferIdentifier()), "cubin");
-    if (!TempFileOrErr)
-      return TempFileOrErr.takeError();
-    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
-        FileOutputBuffer::create(*TempFileOrErr, Input->getBuffer().size());
-    if (!OutputOrErr)
-      return OutputOrErr.takeError();
-    std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
-    llvm::copy(Input->getBuffer(), Output->getBufferStart());
-    if (Error E = Output->commit())
-      return E;
-    Files.emplace_back(Args.MakeArgString(*TempFileOrErr));
-  }
-
-  return Files;
-}
-
-Error runNVLink(ArrayRef<StringRef> Files, const ArgList &Args) {
-  if (Args.hasArg(OPT_lto_emit_asm) || Args.hasArg(OPT_lto_emit_llvm))
-    return Error::success();
-
-  std::string CudaPath = Args.getLastArgValue(OPT_cuda_path_EQ).str();
-  Expected<std::string> NVLinkPath =
-      findProgram(Args, "nvlink", {CudaPath + "/bin"});
-  if (!NVLinkPath)
-    return NVLinkPath.takeError();
-
-  ArgStringList NewLinkerArgs;
-  for (const opt::Arg *Arg : Args) {
-    // Do not forward arguments only intended for the linker wrapper.
-    if (Arg->getOption().hasFlag(WrapperOnlyOption))
-      continue;
-
-    // Do not forward any inputs that we have processed.
-    if (Arg->getOption().matches(OPT_INPUT) ||
-        Arg->getOption().matches(OPT_library))
-      continue;
-
-    Arg->render(Args, NewLinkerArgs);
-  }
-
-  llvm::transform(Files, std::back_inserter(NewLinkerArgs),
-                  [&](StringRef Arg) { return Args.MakeArgString(Arg); });
-
-  SmallVector<StringRef> LinkerArgs({*NVLinkPath});
-  if (!Args.hasArg(OPT_o))
-    LinkerArgs.append({"-o", "a.out"});
-  for (StringRef Arg : NewLinkerArgs)
-    LinkerArgs.push_back(Arg);
-
-  if (Args.hasArg(OPT_dry_run) || Args.hasArg(OPT_verbose))
-    printCommands(LinkerArgs);
-  if (Args.hasArg(OPT_dry_run))
-    return Error::success();
-  if (sys::ExecuteAndWait(*NVLinkPath, LinkerArgs))
-    return createStringError("'" + sys::path::filename(*NVLinkPath) + "'" +
-                             " failed");
-  return Error::success();
-}
-
-} // namespace
-
-int main(int argc, char **argv) {
-  InitLLVM X(argc, argv);
-  InitializeAllTargetInfos();
-  InitializeAllTargets();
-  InitializeAllTargetMCs();
-  InitializeAllAsmParsers();
-  InitializeAllAsmPrinters();
-
-  Executable = argv[0];
-  sys::PrintStackTraceOnErrorSignal(argv[0]);
-
-  const OptTable &Tbl = getOptTable();
-  BumpPtrAllocator Alloc;
-  StringSaver Saver(Alloc);
-  auto Args = Tbl.parseArgs(argc, argv, OPT_INVALID, Saver, [&](StringRef Err) {
-    reportError(createStringError(inconvertibleErrorCode(), Err));
-  });
-
-  if (Args.hasArg(OPT_help) || Args.hasArg(OPT_help_hidden)) {
-    Tbl.printHelp(
-        outs(), "clang-nvlink-wrapper [options] <options to passed to nvlink>",
-        "A utility that wraps around the NVIDIA 'nvlink' linker.\n"
-        "This enables static linking and LTO handling for NVPTX targets.",
-        Args.hasArg(OPT_help_hidden), Args.hasArg(OPT_help_hidden));
-    return EXIT_SUCCESS;
-  }
-
-  if (Args.hasArg(OPT_version))
-    printVersion(outs());
-
-  // This forwards '-mllvm' arguments to LLVM if present.
-  SmallVector<const char *> NewArgv = {argv[0]};
-  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm))
-    NewArgv.push_back(Arg->getValue());
-  for (const opt::Arg *Arg : Args.filtered(OPT_plugin_opt))
-    NewArgv.push_back(Arg->getValue());
-  cl::ParseCommandLineOptions(NewArgv.size(), &NewArgv[0]);
-
-  // Get the input files to pass to 'nvlink'.
-  auto FilesOrErr = getInput(Args);
-  if (!FilesOrErr)
-    reportError(FilesOrErr.takeError());
-
-  // Run 'nvlink' on the generated inputs.
-  if (Error Err = runNVLink(*FilesOrErr, Args))
-    reportError(std::move(Err));
-
-  // Remove the temporary files created.
-  if (!Args.hasArg(OPT_save_temps))
-    for (const auto &TempFile : TempFiles)
-      if (std::error_code EC = sys::fs::remove(TempFile))
-        reportError(createFileError(TempFile, EC));
-
-  return EXIT_SUCCESS;
-}
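
The core of the deleted getInput above is a fixpoint over lazy archive members: a member is extracted only when it defines a symbol that is still undefined in some already-extracted input, and every extraction forces a rescan of all inputs. A minimal, self-contained C++ sketch of that loop follows; the types and names are illustrative, not the wrapper's real data structures.

#include <set>
#include <string>
#include <vector>

struct Input {
  std::vector<std::string> Defines;    // Symbols this file defines.
  std::vector<std::string> Undefineds; // Symbols this file references.
  bool IsLazy;          // Archive member: extract only if it resolves a need.
  bool Extracted = false;
};

std::vector<Input *> resolveInputs(std::vector<Input> &Inputs) {
  std::set<std::string> Needed, Defined;
  std::vector<Input *> LinkerInput;
  bool Changed = true;
  while (Changed) { // Rescan until no further member is extracted.
    Changed = false;
    for (Input &I : Inputs) {
      if (I.Extracted)
        continue;
      bool Resolves = false;
      for (const std::string &S : I.Defines)
        Resolves |= Needed.count(S) && !Defined.count(S);
      if (I.IsLazy && !Resolves)
        continue; // Defines nothing we still need; leave it in the archive.
      I.Extracted = true;
      Changed = true;
      LinkerInput.push_back(&I);
      for (const std::string &S : I.Defines)
        Defined.insert(S);
      for (const std::string &S : I.Undefineds)
        Needed.insert(S);
    }
  }
  return LinkerInput;
}

Eager inputs seed the Needed set on the first pass, so termination is guaranteed: each iteration either extracts at least one input or exits the loop.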
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
deleted file mode 100644
index e84b530f2787d..0000000000000
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ /dev/null
@@ -1,90 +0,0 @@
-include "llvm/Option/OptParser.td"
-
-def WrapperOnlyOption : OptionFlag;
-
-def help : Flag<["-", "--"], "help">,
-  HelpText<"Display available options (--help-hidden for more)">;
-
-def help_hidden : Flag<["-", "--"], "help-hidden">,
-  HelpText<"Display all available options">;
-
-def verbose : Flag<["-"], "v">, HelpText<"Print verbose information">;
-def version : Flag<["--"], "version">,
-  HelpText<"Display the version number and exit">;
-
-def cuda_path_EQ : Joined<["--"], "cuda-path=">,
-  MetaVarName<"<dir>">, HelpText<"Set the system CUDA path">;
-def ptxas_path_EQ : Joined<["--"], "ptxas-path=">,
-  MetaVarName<"<dir>">, HelpText<"Set the 'ptxas' path">;
-
-def o : JoinedOrSeparate<["-"], "o">, MetaVarName<"<path>">,
-  HelpText<"Path to file to write output">;
-def output : Separate<["--"], "output-file">, Alias<o>, Flags<[HelpHidden]>,
-  HelpText<"Alias for -o">;
-
-def library_path : JoinedOrSeparate<["-"], "L">, MetaVarName<"<dir>">,
-  HelpText<"Add <dir> to the library search path">;
-def library_path_S : Separate<["--", "-"], "library-path">, Flags<[HelpHidden]>,
-  Alias<library_path>;
-def library_path_EQ : Joined<["--", "-"], "library-path=">, Flags<[HelpHidden]>,
-  Alias<library_path>;
-
-def library : JoinedOrSeparate<["-"], "l">, MetaVarName<"<libname>">,
-  HelpText<"Search for library <libname>">;
-def library_S : Separate<["--", "-"], "library">, Flags<[HelpHidden]>,
-  Alias<library_path>;
-def library_EQ : Joined<["--", "-"], "library=">, Flags<[HelpHidden]>,
-  Alias<library_path>;
-
-def arch : Separate<["--", "-"], "arch">,
-  HelpText<"Specify the 'sm_' name of the target architecture.">;
-def : Joined<["--", "-"], "plugin-opt=mcpu=">,
-  Flags<[HelpHidden, WrapperOnlyOption]>, Alias<arch>;
-
-def feature : Separate<["--", "-"], "feature">, Flags<[WrapperOnlyOption]>,
-  HelpText<"Specify the '+ptx' freature to use for LTO.">;
-
-def g : Flag<["-"], "g">, HelpText<"Specify that this was a debug compile.">;
-def debug : Flag<["--"], "debug">, Alias<g>;
-
-def lto_emit_llvm : Flag<["--"], "lto-emit-llvm">, Flags<[WrapperOnlyOption]>,
-  HelpText<"Emit LLVM-IR bitcode">;
-def lto_emit_asm : Flag<["--"], "lto-emit-asm">, Flags<[WrapperOnlyOption]>,
-  HelpText<"Emit assembly code">;
-
-def O : Joined<["--", "-"], "plugin-opt=O">,
-  Flags<[WrapperOnlyOption]>, MetaVarName<"<O0, O1, O2, or O3>">,
-  HelpText<"Optimization level for LTO">;
-
-def thinlto : Joined<["--", "-"], "plugin-opt=thinlto">,
-  Flags<[WrapperOnlyOption]>, HelpText<"Enable the thin-lto backend">;
-def lto_partitions : Joined<["--", "-"], "plugin-opt=lto-partitions=">,
-  Flags<[WrapperOnlyOption]>, HelpText<"Number of LTO codegen partitions">;
-def jobs : Joined<["--", "-"], "plugin-opt=jobs=">,
-  Flags<[WrapperOnlyOption]>, HelpText<"Number of LTO codegen partitions">;
-def : Joined<["--", "-"], "plugin-opt=emit-llvm">,
-  Flags<[WrapperOnlyOption]>, Alias<lto_emit_llvm>;
-def : Joined<["--", "-"], "plugin-opt=emit-asm">,
-  Flags<[WrapperOnlyOption]>, Alias<lto_emit_asm>;
-def plugin_opt : Joined<["--", "-"], "plugin-opt=">, Flags<[WrapperOnlyOption]>,
-  HelpText<"Options passed to LLVM, not including the Clang invocation. Use "
-           "'--plugin-opt=--help' for a list of options.">;
-
-def save_temps : Flag<["--", "-"], "save-temps">,
-  Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">;
-
-def whole_archive : Flag<["--", "-"], "whole-archive">,
-  Flags<[WrapperOnlyOption, HelpHidden]>;
-def no_whole_archive : Flag<["--", "-"], "no-whole-archive">,
-  Flags<[WrapperOnlyOption, HelpHidden]>;
-
-def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
-  MetaVarName<"<arg>">,
-  HelpText<"Arguments passed to LLVM, including Clang invocations, for which "
-           "the '-mllvm' prefix is preserved. Use '-mllvm --help' for a list "
-           "of options.">;
-def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>,
-  Alias<mllvm>;
-
-def dry_run : Flag<["--", "-"], "dry-run">, Flags<[WrapperOnlyOption]>,
-  HelpText<"Print generated commands without running.">;
diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
index f5e5fad36573e..c2ccb47a15bc8 100644
--- a/clang/tools/driver/cc1_main.cpp
+++ b/clang/tools/driver/cc1_main.cpp
@@ -241,8 +241,7 @@ int cc1_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
 
   if (!Clang->getFrontendOpts().TimeTracePath.empty()) {
     llvm::timeTraceProfilerInitialize(
-        Clang->getFrontendOpts().TimeTraceGranularity, Argv0,
-        Clang->getFrontendOpts().TimeTraceVerbose);
+        Clang->getFrontendOpts().TimeTraceGranularity, Argv0);
   }
   // --print-supported-cpus takes priority over the actual compilation.
   if (Clang->getFrontendOpts().PrintSupportedCPUs)
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 6d987cc7e9ec6..92f9bae6cb064 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -9681,106 +9681,37 @@ AST_MATCHER_P(EnumDecl, hasEnumConstName, StringRef, ConstName) {
   return false;
 }
 
-TEST_P(ASTImporterOptionSpecificTestBase, ImportAnonymousEnums) {
-  const char *Code =
-      R"(
-      struct A {
-        enum { E1, E2 } x;
-        enum { E3, E4 } y;
-      };
-      )";
-  Decl *FromTU = getTuDecl(Code, Lang_CXX11);
-  auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
-      FromTU, enumDecl(hasEnumConstName("E1")));
-  auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
-  EXPECT_TRUE(ImportedEnumE1);
-  auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
-      FromTU, enumDecl(hasEnumConstName("E3")));
-  auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
-  EXPECT_TRUE(ImportedEnumE3);
-  EXPECT_NE(ImportedEnumE1, ImportedEnumE3);
-}
-
-TEST_P(ASTImporterOptionSpecificTestBase, ImportFreeStandingAnonymousEnums) {
-  const char *Code =
-      R"(
-      struct A {
-        enum { E1, E2 };
-        enum { E3, E4 };
-      };
-      )";
-  Decl *FromTU = getTuDecl(Code, Lang_CXX11);
-  auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
-      FromTU, enumDecl(hasEnumConstName("E1")));
-  auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
-  EXPECT_TRUE(ImportedEnumE1);
-  auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
-      FromTU, enumDecl(hasEnumConstName("E3")));
-  auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
-  EXPECT_TRUE(ImportedEnumE3);
-  EXPECT_NE(ImportedEnumE1, ImportedEnumE3);
-}
-
-TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingAnonymousEnums) {
+TEST_P(ASTImporterOptionSpecificTestBase, ImportAnonymousEnum) {
   const char *ToCode =
       R"(
       struct A {
-        enum { E1, E2 } x;
-        enum { E3, E4 } y;
+        enum { E1, E2} x;
+        enum { E3, E4} y;
       };
       )";
   Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11);
-  auto *ToEnumE1 = FirstDeclMatcher<EnumDecl>().match(
+  auto *ToE1 = FirstDeclMatcher<EnumDecl>().match(
       ToTU, enumDecl(hasEnumConstName("E1")));
-  auto *ToEnumE3 = FirstDeclMatcher<EnumDecl>().match(
+  auto *ToE3 = FirstDeclMatcher<EnumDecl>().match(
       ToTU, enumDecl(hasEnumConstName("E3")));
   const char *Code =
       R"(
       struct A {
-        enum { E1, E2 } x;
-        enum { E3, E4 } y;
+        enum { E1, E2} x;
+        enum { E3, E4} y;
       };
       )";
   Decl *FromTU = getTuDecl(Code, Lang_CXX11);
-  auto *FromEnumE1 = FirstDeclMatcher<EnumDecl>().match(
+  auto *FromE1 = FirstDeclMatcher<EnumDecl>().match(
       FromTU, enumDecl(hasEnumConstName("E1")));
-  auto *ImportedEnumE1 = Import(FromEnumE1, Lang_CXX11);
-  ASSERT_TRUE(ImportedEnumE1);
-  EXPECT_EQ(ImportedEnumE1, ToEnumE1);
-  auto *FromEnumE3 = FirstDeclMatcher<EnumDecl>().match(
-      FromTU, enumDecl(hasEnumConstName("E3")));
-  auto *ImportedEnumE3 = Import(FromEnumE3, Lang_CXX11);
-  ASSERT_TRUE(ImportedEnumE3);
-  EXPECT_EQ(ImportedEnumE3, ToEnumE3);
-}
-
-TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingEmptyAnonymousEnums) {
-  const char *ToCode =
-      R"(
-      struct A {
-        enum {};
-      };
-      )";
-  Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11);
-  auto *ToE1 = FirstDeclMatcher<EnumDecl>().match(ToTU, enumDecl());
-  const char *Code =
-      R"(
-      struct A {
-        enum {};
-        enum {};
-      };
-      )";
-  Decl *FromTU = getTuDecl(Code, Lang_CXX11);
-  auto *FromE1 = FirstDeclMatcher<EnumDecl>().match(FromTU, enumDecl());
   auto *ImportedE1 = Import(FromE1, Lang_CXX11);
   ASSERT_TRUE(ImportedE1);
   EXPECT_EQ(ImportedE1, ToE1);
-  auto *FromE2 = LastDeclMatcher<EnumDecl>().match(FromTU, enumDecl());
-  ASSERT_NE(FromE1, FromE2);
-  auto *ImportedE2 = Import(FromE2, Lang_CXX11);
-  ASSERT_TRUE(ImportedE2);
-  // FIXME: These should not be equal, or the import should fail.
-  EXPECT_EQ(ImportedE2, ToE1);
+  auto *FromE3 = FirstDeclMatcher<EnumDecl>().match(
+      FromTU, enumDecl(hasEnumConstName("E3")));
+  auto *ImportedE3 = Import(FromE3, Lang_CXX11);
+  ASSERT_TRUE(ImportedE3);
+  EXPECT_EQ(ImportedE3, ToE3);
 }
 
 INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest,
diff --git a/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp b/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
index 41fca6bf6eba2..a4ac597bb06d6 100644
--- a/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
@@ -473,32 +473,4 @@ TEST_F(EnvironmentTest, Stmt) {
   Env.getResultObjectLocation(*Init);
 }
 
-// This is a crash repro.
-TEST_F(EnvironmentTest, LambdaCapturingThisInFieldInitializer) {
-  using namespace ast_matchers;
-  std::string Code = R"cc(
-      struct S {
-        int f{[this]() { return 1; }()};
-      };
-    )cc";
-
-  auto Unit =
-      tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++11"});
-  auto &Context = Unit->getASTContext();
-
-  ASSERT_EQ(Context.getDiagnostics().getClient()->getNumErrors(), 0U);
-
-  auto *LambdaCallOperator = selectFirst<CXXMethodDecl>(
-      "method", match(cxxMethodDecl(hasName("operator()"),
-                                    ofClass(cxxRecordDecl(isLambda())))
-                          .bind("method"),
-                      Context));
-
-  Environment Env(DAContext, *LambdaCallOperator);
-  // Don't crash when initializing.
-  Env.initialize();
-  // And initialize the captured `this` pointee.
-  ASSERT_NE(nullptr, Env.getThisPointeeStorageLocation());
-}
-
 } // namespace
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 39fcbab3447a7..d01ce137b8fcb 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -8742,7 +8742,6 @@ TEST_F(FormatTest, FunctionAnnotations) {
                "    << abc;");
   verifyFormat("MACRO(abc)::function() // wrap\n"
                "    << abc;");
-  verifyFormat("FOO(bar)();", getLLVMStyleWithColumns(0));
 }
 
 TEST_F(FormatTest, BreaksDesireably) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index b01ca322505b1..f70424c3ee060 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1930,10 +1930,6 @@ TEST_F(TokenAnnotatorTest, UnderstandsFunctionAnnotations) {
                     "A(T) noexcept;");
   ASSERT_EQ(Tokens.size(), 12u) << Tokens;
   EXPECT_TOKEN(Tokens[8], tok::r_paren, TT_Unknown);
-
-  Tokens = annotate("FOO(bar)();");
-  ASSERT_EQ(Tokens.size(), 8u) << Tokens;
-  EXPECT_TOKEN(Tokens[3], tok::r_paren, TT_Unknown);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsFunctionDeclarationNames) {
@@ -2129,13 +2125,6 @@ TEST_F(TokenAnnotatorTest, UnderstandsTrailingReturnArrow) {
   ASSERT_EQ(Tokens.size(), 21u) << Tokens;
   EXPECT_TOKEN(Tokens[13], tok::arrow, TT_Unknown);
 
-  auto Style = getLLVMStyle();
-  Style.StatementAttributeLikeMacros.push_back("emit");
-  Tokens = annotate("emit foo()->bar;", Style);
-  ASSERT_EQ(Tokens.size(), 8u) << Tokens;
-  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_StatementAttributeLikeMacro);
-  EXPECT_TOKEN(Tokens[4], tok::arrow, TT_Unknown);
-
   // Mixed
   Tokens = annotate("auto f() -> int { auto a = b()->c; }");
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
@@ -2961,13 +2950,6 @@ TEST_F(TokenAnnotatorTest, StartOfName) {
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::at, TT_ObjCDecl);
   EXPECT_TOKEN(Tokens[2], tok::identifier, TT_StartOfName);
-
-  auto Style = getLLVMStyle();
-  Style.StatementAttributeLikeMacros.push_back("emit");
-  Tokens = annotate("emit foo = 0;", Style);
-  ASSERT_EQ(Tokens.size(), 6u) << Tokens;
-  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_StatementAttributeLikeMacro);
-  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown);
 }
 
 TEST_F(TokenAnnotatorTest, BraceKind) {
diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
index 513e184be09ec..23304fff950eb 100644
--- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -650,17 +650,6 @@ TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) {
   EXPECT_STREQ("@import A.B;\n", Out.data());
 }
 
-TEST(MinimizeSourceToDependencyDirectivesTest, EmptyIncludesAndImports) {
-  SmallVector<char, 128> Out;
-
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#import\n", Out));
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#include\n", Out));
-  ASSERT_TRUE(minimizeSourceToDependencyDirectives("#ifdef A\n"
-                                                   "#import \n"
-                                                   "#endif\n",
-                                                   Out));
-}
-
 TEST(MinimizeSourceToDependencyDirectivesTest, AtImportFailures) {
   SmallVector<char, 128> Out;
 
diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp
index f53fe71d630bf..5f3950ff033f1 100644
--- a/clang/unittests/Support/TimeProfilerTest.cpp
+++ b/clang/unittests/Support/TimeProfilerTest.cpp
@@ -10,15 +10,11 @@
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Lex/PreprocessorOptions.h"
 
-#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/JSON.h"
-#include "llvm/Support/Path.h"
 #include "llvm/Support/TimeProfiler.h"
-#include "llvm/Support/VirtualFileSystem.h"
 #include <stack>
 
 #include "gtest/gtest.h"
-#include <tuple>
 
 using namespace clang;
 using namespace llvm;
@@ -27,8 +23,7 @@ namespace {
 
 // Should be called before testing.
 void setupProfiler() {
-  timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test",
-                              /*TimeTraceVerbose=*/true);
+  timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test");
 }
 
 // Should be called after `compileFromString()`.
@@ -43,24 +38,14 @@ std::string teardownProfiler() {
 
 // Returns true if code compiles successfully.
 // We only parse AST here. This is enough for constexpr evaluation.
-bool compileFromString(StringRef Code, StringRef Standard, StringRef File,
-                       llvm::StringMap<std::string> Headers = {}) {
+bool compileFromString(StringRef Code, StringRef Standard, StringRef FileName) {
   CompilerInstance Compiler;
   Compiler.createDiagnostics();
 
-  llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS(
-      new llvm::vfs::InMemoryFileSystem());
-  FS->addFile(File, 0, MemoryBuffer::getMemBuffer(Code));
-  for (const auto &Header : Headers) {
-    FS->addFile(Header.getKey(), 0,
-                MemoryBuffer::getMemBuffer(Header.getValue()));
-  }
-  llvm::IntrusiveRefCntPtr<FileManager> Files(
-      new FileManager(FileSystemOptions(), FS));
-  Compiler.setFileManager(Files.get());
-
   auto Invocation = std::make_shared<CompilerInvocation>();
-  std::vector<const char *> Args = {Standard.data(), File.data()};
+  Invocation->getPreprocessorOpts().addRemappedFile(
+      FileName, MemoryBuffer::getMemBuffer(Code).release());
+  const char *Args[] = {Standard.data(), FileName.data()};
   CompilerInvocation::CreateFromArgs(*Invocation, Args,
                                      Compiler.getDiagnostics());
   Compiler.setInvocation(std::move(Invocation));
@@ -75,28 +60,13 @@ bool compileFromString(StringRef Code, StringRef Standard, StringRef File,
   return Compiler.ExecuteAction(Action);
 }
 
-std::string GetMetadata(json::Object *Event) {
-  std::string M;
-  llvm::raw_string_ostream OS(M);
-  if (json::Object *Args = Event->getObject("args")) {
-    if (auto Detail = Args->getString("detail"))
-      OS << Detail;
-    // Use only filename to not include os-specific path separators.
-    if (auto File = Args->getString("file"))
-      OS << (M.empty() ? "" : ", ") << llvm::sys::path::filename(*File);
-    if (auto Line = Args->getInteger("line"))
-      OS << ":" << *Line;
-  }
-  return M;
-}
-
 // Returns pretty-printed trace graph.
 std::string buildTraceGraph(StringRef Json) {
   struct EventRecord {
     int64_t TimestampBegin;
     int64_t TimestampEnd;
-    std::string Name;
-    std::string Metadata;
+    StringRef Name;
+    StringRef Detail;
   };
   std::vector<EventRecord> Events;
 
@@ -111,13 +81,10 @@ std::string buildTraceGraph(StringRef Json) {
     int64_t TimestampBegin = TraceEventObj->getInteger("ts").value_or(0);
     int64_t TimestampEnd =
         TimestampBegin + TraceEventObj->getInteger("dur").value_or(0);
-    std::string Name = TraceEventObj->getString("name").value_or("").str();
-    std::string Metadata = GetMetadata(TraceEventObj);
-
-    // Source events are asynchronous events and may not perfectly nest the
-    // synchronous events. Skip testing them.
-    if (Name == "Source")
-      continue;
+    StringRef Name = TraceEventObj->getString("name").value_or("");
+    StringRef Detail = "";
+    if (json::Object *Args = TraceEventObj->getObject("args"))
+      Detail = Args->getString("detail").value_or("");
 
     // This is a "summary" event, like "Total PerformPendingInstantiations",
     // skip it
@@ -125,7 +92,7 @@ std::string buildTraceGraph(StringRef Json) {
       continue;
 
     Events.emplace_back(
-        EventRecord{TimestampBegin, TimestampEnd, Name, Metadata});
+        EventRecord{TimestampBegin, TimestampEnd, Name, Detail});
   }
 
   // There can be nested events that are very fast, for example:
@@ -165,9 +132,9 @@ std::string buildTraceGraph(StringRef Json) {
       Stream << "| ";
     }
     Stream.write(Event.Name.data(), Event.Name.size());
-    if (!Event.Metadata.empty()) {
+    if (!Event.Detail.empty()) {
       Stream << " (";
-      Stream.write(Event.Metadata.data(), Event.Metadata.size());
+      Stream.write(Event.Detail.data(), Event.Detail.size());
       Stream << ")";
     }
     Stream << "\n";
@@ -178,7 +145,7 @@ std::string buildTraceGraph(StringRef Json) {
 } // namespace
 
 TEST(TimeProfilerTest, ConstantEvaluationCxx20) {
-  std::string Code = R"(
+  constexpr StringRef Code = R"(
 void print(double value);
 
 namespace slow_namespace {
@@ -208,8 +175,9 @@ constexpr int slow_init_list[] = {1, 1, 2, 3, 5, 8, 13, 21}; // 25th line
   setupProfiler();
   ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc"));
   std::string Json = teardownProfiler();
-  ASSERT_EQ(R"(
-Frontend (test.cc)
+  std::string TraceGraph = buildTraceGraph(Json);
+  ASSERT_TRUE(TraceGraph == R"(
+Frontend
 | ParseDeclarationOrFunctionDefinition (test.cc:2:1)
 | ParseDeclarationOrFunctionDefinition (test.cc:6:1)
 | | ParseFunctionDefinition (slow_func)
@@ -234,54 +202,14 @@ Frontend (test.cc)
 | ParseDeclarationOrFunctionDefinition (test.cc:25:1)
 | | EvaluateAsInitializer (slow_init_list)
 | PerformPendingInstantiations
-)",
-            buildTraceGraph(Json));
-}
-
-TEST(TimeProfilerTest, TemplateInstantiations) {
-  std::string B_H = R"(
-    template <typename T>
-    T fooB(T t) {
-      return T();
-    }
-
-    #define MacroTemp(x) template <typename T> void foo##x(T) { T(); }
-  )";
-
-  std::string A_H = R"(
-    #include "b.h"
-
-    MacroTemp(MTA)
-
-    template <typename T>
-    void fooA(T t) { fooB(t); fooMTA(t); }
-  )";
-  std::string Code = R"(
-    #include "a.h"
-    void user() { fooA(0); }
-  )";
+)");
 
-  setupProfiler();
-  ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc",
-                                /*Headers=*/{{"a.h", A_H}, {"b.h", B_H}}));
-  std::string Json = teardownProfiler();
-  ASSERT_EQ(R"(
-Frontend (test.cc)
-| ParseFunctionDefinition (fooB)
-| ParseFunctionDefinition (fooMTA)
-| ParseFunctionDefinition (fooA)
-| ParseDeclarationOrFunctionDefinition (test.cc:3:5)
-| | ParseFunctionDefinition (user)
-| PerformPendingInstantiations
-| | InstantiateFunction (fooA<int>, a.h:7)
-| | | InstantiateFunction (fooB<int>, b.h:3)
-| | | InstantiateFunction (fooMTA<int>, a.h:4)
-)",
-            buildTraceGraph(Json));
+  // NOTE: If this test fails, rerun it with
+  // `llvm::errs() << TraceGraph;` and change the assert above.
 }
 
 TEST(TimeProfilerTest, ConstantEvaluationC99) {
-  std::string Code = R"(
+  constexpr StringRef Code = R"(
 struct {
   short quantval[4]; // 3rd line
 } value;
@@ -290,12 +218,15 @@ struct {
   setupProfiler();
   ASSERT_TRUE(compileFromString(Code, "-std=c99", "test.c"));
   std::string Json = teardownProfiler();
-  ASSERT_EQ(R"(
-Frontend (test.c)
+  std::string TraceGraph = buildTraceGraph(Json);
+  ASSERT_TRUE(TraceGraph == R"(
+Frontend
 | ParseDeclarationOrFunctionDefinition (test.c:2:1)
 | | isIntegerConstantExpr (<test.c:3:18>)
 | | EvaluateKnownConstIntCheckOverflow (<test.c:3:18>)
 | PerformPendingInstantiations
-)",
-            buildTraceGraph(Json));
+)");
+
+  // NOTE: If this test is failing, run this test with
+  // `llvm::errs() << TraceGraph;` and change the assert above.
 }
diff --git a/clang/unittests/Tooling/CMakeLists.txt b/clang/unittests/Tooling/CMakeLists.txt
index 401978c31863c..0eb612f8d9498 100644
--- a/clang/unittests/Tooling/CMakeLists.txt
+++ b/clang/unittests/Tooling/CMakeLists.txt
@@ -40,7 +40,6 @@ add_clang_unittest(ToolingTests
   RecursiveASTVisitorTests/CXXMethodDecl.cpp
   RecursiveASTVisitorTests/CXXOperatorCallExprTraverser.cpp
   RecursiveASTVisitorTests/DeclRefExpr.cpp
-  RecursiveASTVisitorTests/DeductionGuide.cpp
   RecursiveASTVisitorTests/ImplicitCtor.cpp
   RecursiveASTVisitorTests/ImplicitCtorInitializer.cpp
   RecursiveASTVisitorTests/InitListExprPostOrder.cpp
diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp
index df878bfc113e5..cd4bf0eb7bd5a 100644
--- a/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp
+++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/DeductionGuide.cpp
@@ -37,7 +37,7 @@ TEST(RecursiveASTVisitor, DeductionGuideNonImplicitMode) {
   // Verify that the synthesized deduction guide for alias is not visited in
   // RAV's implicit mode.
   Visitor.ExpectMatch("Foo(T) -> Foo<int>", 11, 1);
-  Visitor.DisallowMatch("Bar(T) -> Foo<int>", 14, 1);
+  Visitor.DisallowMatch("Bar(type-parameter-0-0) -> Foo<int>", 14, 1);
   EXPECT_TRUE(Visitor.runOver(
       R"cpp(
 template <typename T>
@@ -61,7 +61,7 @@ Bar s(1);
 TEST(RecursiveASTVisitor, DeductionGuideImplicitMode) {
   DeductionGuideVisitor Visitor(/*ShouldVisitImplicitCode*/ true);
   Visitor.ExpectMatch("Foo(T) -> Foo<int>", 11, 1);
-  Visitor.ExpectMatch("Bar(T) -> Foo<int>", 14, 1);
+  Visitor.ExpectMatch("Bar(type-parameter-0-0) -> Foo<int>", 14, 1);
   EXPECT_TRUE(Visitor.runOver(
       R"cpp(
 template <typename T>
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 30fbb8c5d65e5..626031d38cf00 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -952,7 +952,7 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
   char typeCode = '\0';
   bool printNumber = true;
 
-  if (CK == ClassB && TargetGuard == "neon")
+  if (CK == ClassB && TargetGuard == "")
     return "";
 
   if (T.isBFloat16())
@@ -976,7 +976,7 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
       break;
     }
   }
-  if (CK == ClassB && TargetGuard == "neon") {
+  if (CK == ClassB && TargetGuard == "") {
     typeCode = '\0';
   }
 
@@ -1078,7 +1078,7 @@ std::string Intrinsic::mangleName(std::string Name, ClassKind LocalCK) const {
     S += "_" + getInstTypeCode(InBaseType, LocalCK);
   }
 
-  if (LocalCK == ClassB && TargetGuard == "neon")
+  if (LocalCK == ClassB && TargetGuard == "")
     S += "_v";
 
   // Insert a 'q' before the first '_' character so that it ends up before
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index a6ded8be3ae9e..1f69a4e8a5620 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -182,7 +182,7 @@ <h2 id="cxx26">C++2c implementation status</h2>
  <tr>
   <td>Module Declarations Shouldn’t be Macros</td>
   <td><a href="https://wg21.link/P3034R1">P3034R1</a> (<a href="#dr">DR</a>)</td>
-  <td class="none" align="center">No</td>
+  <td class="unreleased" align="center">Clang 19</td>
  </tr>
  <tr>
   <td>Trivial infinite loops are not Undefined Behavior</td>
diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index 897dd963bd9ab..5e28283fbc1c6 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -1,7 +1,7 @@
 # The LLVM Version number information
 
 if(NOT DEFINED LLVM_VERSION_MAJOR)
-  set(LLVM_VERSION_MAJOR 20)
+  set(LLVM_VERSION_MAJOR 19)
 endif()
 if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 0)
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index 29e5beb6182ba..02ff92f693810 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -32,7 +32,9 @@ set(ALL_ASAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
     ${LOONGARCH64})
 set(ALL_ASAN_ABI_SUPPORTED_ARCH ${X86_64} ${ARM64} ${ARM64_32})
 set(ALL_DFSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${LOONGARCH64})
-set(ALL_RTSAN_SUPPORTED_ARCH ${X86_64} ${ARM64})
+#set(ALL_RTSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
+#    ${MIPS32} ${MIPS64} ${PPC64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
+#    ${LOONGARCH64})
 
 if(ANDROID)
   set(OS_NAME "Android")
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 3a151772e268a..74fdbd288a1f0 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -176,7 +176,6 @@ check_cxx_compiler_flag(-nostdlib++ COMPILER_RT_HAS_NOSTDLIBXX_FLAG)
 check_include_files("sys/auxv.h"    COMPILER_RT_HAS_AUXV)
 
 # Libraries.
-check_library_exists(atomic __atomic_load_8 "" COMPILER_RT_HAS_LIBATOMIC)
 check_library_exists(dl dlopen "" COMPILER_RT_HAS_LIBDL)
 check_library_exists(rt shm_open "" COMPILER_RT_HAS_LIBRT)
 check_library_exists(m pow "" COMPILER_RT_HAS_LIBM)
@@ -760,7 +759,7 @@ else()
 endif()
 
 if (COMPILER_RT_HAS_SANITIZER_COMMON AND RTSAN_SUPPORTED_ARCH AND
-    OS_NAME MATCHES "Darwin|Linux")
+    OS_NAME MATCHES "Android|Darwin|Linux")
   set(COMPILER_RT_HAS_RTSAN TRUE)
 else()
   set(COMPILER_RT_HAS_RTSAN FALSE)
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 847e53cfa7432..e9866d94b762c 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -739,7 +739,6 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
 #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
 #define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
-#define INSTR_PROF_PROFILE_SAMPLING_VAR __llvm_profile_sampling
 
 /* The variable that holds the name of the profile data
  * specified via command line. */
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 88a5998fd4610..744bbfacf32f1 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -571,7 +571,7 @@ set(aarch64_SOURCES
 
 if (COMPILER_RT_HAS_AARCH64_SME)
   if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
-    list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c)
+    list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c)
     message(STATUS "AArch64 SME ABI routines enabled")
     set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
   else()
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
deleted file mode 100644
index 926ad3b1b6331..0000000000000
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
+++ /dev/null
@@ -1,344 +0,0 @@
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-// Routines taken from libc/AOR_v20.02/string/aarch64
-
-#include "../assembly.h"
-
-#ifdef __aarch64__
-
-#define L(l) .L ## l
-
-//
-//  __arm_sc_memcpy / __arm_sc_memmove
-//
-
-#define dstin    x0
-#define src      x1
-#define count    x2
-#define dst      x3
-#define srcend1  x4
-#define dstend1  x5
-#define A_l      x6
-#define A_lw     w6
-#define A_h      x7
-#define B_l      x8
-#define B_lw     w8
-#define B_h      x9
-#define C_l      x10
-#define C_lw     w10
-#define C_h      x11
-#define D_l      x12
-#define D_h      x13
-#define E_l      x14
-#define E_h      x15
-#define F_l      x16
-#define F_h      x17
-#define G_l      count
-#define G_h      dst
-#define H_l      src
-#define H_h      srcend1
-#define tmp1     x14
-
-/* This implementation handles overlaps and supports both memcpy and memmove
-   from a single entry point.  It uses unaligned accesses and branchless
-   sequences to keep the code small, simple and improve performance.
-
-   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
-   copies of up to 128 bytes, and large copies.  The overhead of the overlap
-   check is negligible since it is only required for large copies.
-
-   Large copies use a software pipelined loop processing 64 bytes per iteration.
-   The destination pointer is 16-byte aligned to minimize unaligned accesses.
-   The loop tail is handled by always copying 64 bytes from the end.
-*/
-
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
-        add     srcend1, src, count
-        add     dstend1, dstin, count
-        cmp     count, 128
-        b.hi    L(copy_long)
-        cmp     count, 32
-        b.hi    L(copy32_128)
-
-        /* Small copies: 0..32 bytes.  */
-        cmp     count, 16
-        b.lo    L(copy16)
-        ldp     A_l, A_h, [src]
-        ldp     D_l, D_h, [srcend1, -16]
-        stp     A_l, A_h, [dstin]
-        stp     D_l, D_h, [dstend1, -16]
-        ret
-
-        /* Copy 8-15 bytes.  */
-L(copy16):
-        tbz     count, 3, L(copy8)
-        ldr     A_l, [src]
-        ldr     A_h, [srcend1, -8]
-        str     A_l, [dstin]
-        str     A_h, [dstend1, -8]
-        ret
-
-        .p2align 3
-        /* Copy 4-7 bytes.  */
-L(copy8):
-        tbz     count, 2, L(copy4)
-        ldr     A_lw, [src]
-        ldr     B_lw, [srcend1, -4]
-        str     A_lw, [dstin]
-        str     B_lw, [dstend1, -4]
-        ret
-
-        /* Copy 0..3 bytes using a branchless sequence.  */
-L(copy4):
-        cbz     count, L(copy0)
-        lsr     tmp1, count, 1
-        ldrb    A_lw, [src]
-        ldrb    C_lw, [srcend1, -1]
-        ldrb    B_lw, [src, tmp1]
-        strb    A_lw, [dstin]
-        strb    B_lw, [dstin, tmp1]
-        strb    C_lw, [dstend1, -1]
-L(copy0):
-        ret
-
-        .p2align 4
-        /* Medium copies: 33..128 bytes.  */
-L(copy32_128):
-        ldp     A_l, A_h, [src]
-        ldp     B_l, B_h, [src, 16]
-        ldp     C_l, C_h, [srcend1, -32]
-        ldp     D_l, D_h, [srcend1, -16]
-        cmp     count, 64
-        b.hi    L(copy128)
-        stp     A_l, A_h, [dstin]
-        stp     B_l, B_h, [dstin, 16]
-        stp     C_l, C_h, [dstend1, -32]
-        stp     D_l, D_h, [dstend1, -16]
-        ret
-
-        .p2align 4
-        /* Copy 65..128 bytes.  */
-L(copy128):
-        ldp     E_l, E_h, [src, 32]
-        ldp     F_l, F_h, [src, 48]
-        cmp     count, 96
-        b.ls    L(copy96)
-        ldp     G_l, G_h, [srcend1, -64]
-        ldp     H_l, H_h, [srcend1, -48]
-        stp     G_l, G_h, [dstend1, -64]
-        stp     H_l, H_h, [dstend1, -48]
-L(copy96):
-        stp     A_l, A_h, [dstin]
-        stp     B_l, B_h, [dstin, 16]
-        stp     E_l, E_h, [dstin, 32]
-        stp     F_l, F_h, [dstin, 48]
-        stp     C_l, C_h, [dstend1, -32]
-        stp     D_l, D_h, [dstend1, -16]
-        ret
-
-        .p2align 4
-        /* Copy more than 128 bytes.  */
-L(copy_long):
-        /* Use backwards copy if there is an overlap.  */
-        sub     tmp1, dstin, src
-        cbz     tmp1, L(copy0)
-        cmp     tmp1, count
-        b.lo    L(copy_long_backwards)
-
-        /* Copy 16 bytes and then align dst to 16-byte alignment.  */
-
-        ldp     D_l, D_h, [src]
-        and     tmp1, dstin, 15
-        bic     dst, dstin, 15
-        sub     src, src, tmp1
-        add     count, count, tmp1      /* Count is now 16 too large.  */
-        ldp     A_l, A_h, [src, 16]
-        stp     D_l, D_h, [dstin]
-        ldp     B_l, B_h, [src, 32]
-        ldp     C_l, C_h, [src, 48]
-        ldp     D_l, D_h, [src, 64]!
-        subs    count, count, 128 + 16  /* Test and readjust count.  */
-        b.ls    L(copy64_from_end)
-L(loop64):
-        stp     A_l, A_h, [dst, 16]
-        ldp     A_l, A_h, [src, 16]
-        stp     B_l, B_h, [dst, 32]
-        ldp     B_l, B_h, [src, 32]
-        stp     C_l, C_h, [dst, 48]
-        ldp     C_l, C_h, [src, 48]
-        stp     D_l, D_h, [dst, 64]!
-        ldp     D_l, D_h, [src, 64]!
-        subs    count, count, 64
-        b.hi    L(loop64)
-
-        /* Write the last iteration and copy 64 bytes from the end.  */
-L(copy64_from_end):
-        ldp     E_l, E_h, [srcend1, -64]
-        stp     A_l, A_h, [dst, 16]
-        ldp     A_l, A_h, [srcend1, -48]
-        stp     B_l, B_h, [dst, 32]
-        ldp     B_l, B_h, [srcend1, -32]
-        stp     C_l, C_h, [dst, 48]
-        ldp     C_l, C_h, [srcend1, -16]
-        stp     D_l, D_h, [dst, 64]
-        stp     E_l, E_h, [dstend1, -64]
-        stp     A_l, A_h, [dstend1, -48]
-        stp     B_l, B_h, [dstend1, -32]
-        stp     C_l, C_h, [dstend1, -16]
-        ret
-
-        .p2align 4
-
-        /* Large backwards copy for overlapping copies.
-           Copy 16 bytes and then align dst to 16-byte alignment.  */
-L(copy_long_backwards):
-        ldp     D_l, D_h, [srcend1, -16]
-        and     tmp1, dstend1, 15
-        sub     srcend1, srcend1, tmp1
-        sub     count, count, tmp1
-        ldp     A_l, A_h, [srcend1, -16]
-        stp     D_l, D_h, [dstend1, -16]
-        ldp     B_l, B_h, [srcend1, -32]
-        ldp     C_l, C_h, [srcend1, -48]
-        ldp     D_l, D_h, [srcend1, -64]!
-        sub     dstend1, dstend1, tmp1
-        subs    count, count, 128
-        b.ls    L(copy64_from_start)
-
-L(loop64_backwards):
-        stp     A_l, A_h, [dstend1, -16]
-        ldp     A_l, A_h, [srcend1, -16]
-        stp     B_l, B_h, [dstend1, -32]
-        ldp     B_l, B_h, [srcend1, -32]
-        stp     C_l, C_h, [dstend1, -48]
-        ldp     C_l, C_h, [srcend1, -48]
-        stp     D_l, D_h, [dstend1, -64]!
-        ldp     D_l, D_h, [srcend1, -64]!
-        subs    count, count, 64
-        b.hi    L(loop64_backwards)
-
-        /* Write the last iteration and copy 64 bytes from the start.  */
-L(copy64_from_start):
-        ldp     G_l, G_h, [src, 48]
-        stp     A_l, A_h, [dstend1, -16]
-        ldp     A_l, A_h, [src, 32]
-        stp     B_l, B_h, [dstend1, -32]
-        ldp     B_l, B_h, [src, 16]
-        stp     C_l, C_h, [dstend1, -48]
-        ldp     C_l, C_h, [src]
-        stp     D_l, D_h, [dstend1, -64]
-        stp     G_l, G_h, [dstin, 48]
-        stp     A_l, A_h, [dstin, 32]
-        stp     B_l, B_h, [dstin, 16]
-        stp     C_l, C_h, [dstin]
-        ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)
-
-DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
-
-
-//
-//  __arm_sc_memset
-//
-
-#define dstin    x0
-#define val      x1
-#define valw     w1
-#define count    x2
-#define dst      x3
-#define dstend2  x4
-#define zva_val  x5
-
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
-        dup     v0.16B, valw
-        add     dstend2, dstin, count
-
-        cmp     count, 96
-        b.hi    L(set_long)
-        cmp     count, 16
-        b.hs    L(set_medium)
-        mov     val, v0.D[0]
-
-        /* Set 0..15 bytes.  */
-        tbz     count, 3, 1f
-        str     val, [dstin]
-        str     val, [dstend2, -8]
-        ret
-        nop
-1:      tbz     count, 2, 2f
-        str     valw, [dstin]
-        str     valw, [dstend2, -4]
-        ret
-2:      cbz     count, 3f
-        strb    valw, [dstin]
-        tbz     count, 1, 3f
-        strh    valw, [dstend2, -2]
-3:      ret
-
-        /* Set 17..96 bytes.  */
-L(set_medium):
-        str     q0, [dstin]
-        tbnz    count, 6, L(set96)
-        str     q0, [dstend2, -16]
-        tbz     count, 5, 1f
-        str     q0, [dstin, 16]
-        str     q0, [dstend2, -32]
-1:      ret
-
-        .p2align 4
-        /* Set 64..96 bytes.  Write 64 bytes from the start and
-           32 bytes from the end.  */
-L(set96):
-        str     q0, [dstin, 16]
-        stp     q0, q0, [dstin, 32]
-        stp     q0, q0, [dstend2, -32]
-        ret
-
-        .p2align 4
-L(set_long):
-        and     valw, valw, 255
-        bic     dst, dstin, 15
-        str     q0, [dstin]
-        cmp     count, 160
-        ccmp    valw, 0, 0, hs
-        b.ne    L(no_zva)
-
-#ifndef SKIP_ZVA_CHECK
-        mrs     zva_val, dczid_el0
-        and     zva_val, zva_val, 31
-        cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
-        b.ne    L(no_zva)
-#endif
-        str     q0, [dst, 16]
-        stp     q0, q0, [dst, 32]
-        bic     dst, dst, 63
-        sub     count, dstend2, dst      /* Count is now 64 too large.  */
-        sub     count, count, 128       /* Adjust count and bias for loop.  */
-
-        .p2align 4
-L(zva_loop):
-        add     dst, dst, 64
-        dc      zva, dst
-        subs    count, count, 64
-        b.hi    L(zva_loop)
-        stp     q0, q0, [dstend2, -64]
-        stp     q0, q0, [dstend2, -32]
-        ret
-
-L(no_zva):
-        sub     count, dstend2, dst      /* Count is 16 too large.  */
-        sub     dst, dst, 16            /* Dst is biased by -32.  */
-        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-L(no_zva_loop):
-        stp     q0, q0, [dst, 32]
-        stp     q0, q0, [dst, 64]!
-        subs    count, count, 64
-        b.hi    L(no_zva_loop)
-        stp     q0, q0, [dstend2, -64]
-        stp     q0, q0, [dstend2, -32]
-        ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
-
-#endif // __aarch64__
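For context on the forward/backward decision in the routine deleted above: it
hinges on a single unsigned compare, (dst - src) < count, which is true exactly
when dst falls inside [src, src + count), i.e. when a forward copy would
overwrite source bytes that have not yet been read. A minimal C++ sketch of
that test (illustrative only, not part of this patch; the real routine first
special-cases dst == src):

    #include <cstdint>
    #include <cstdio>

    // True when the destination starts inside the source range, so a plain
    // forward copy would clobber unread source bytes. Unsigned wraparound
    // makes the test also return false whenever dst is below src.
    static bool needs_backward_copy(uintptr_t dst, uintptr_t src,
                                    uintptr_t count) {
      return dst - src < count;
    }

    int main() {
      std::printf("%d\n", needs_backward_copy(104, 100, 16)); // 1: dst ahead, overlap
      std::printf("%d\n", needs_backward_copy(100, 104, 16)); // 0: forward copy safe
      std::printf("%d\n", needs_backward_copy(200, 100, 16)); // 0: disjoint
    }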
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
index 315490e73ea2b..89b52b0d1a880 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -1,5 +1,80 @@
 #include <stddef.h>
 
+// WARNING: When building the scalar versions of these functions you need to
+// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
+// from recognising a loop idiom and planting calls to memcpy!
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = srcp[i];
+
+  return dest;
+}
+
+// If dest and src overlap then behaviour is undefined, hence we can add the
+// restrict keywords here. This also matches the definition of the libc memcpy
+// according to the man page.
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
+                      size_t n) __arm_streaming_compatible {
+  return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  unsigned char c8 = (unsigned char)c;
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = c8;
+
+  return dest;
+}
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+  // TODO: Improve performance by copying larger chunks in reverse, or by
+  // using SVE.
+  while (n > 0) {
+    --n;
+    destp[n] = srcp[n];
+  }
+  return dest;
+}
+
+// Semantically a memmove is equivalent to the following:
+//   1. Copy the entire contents of src to a temporary array that does not
+//      overlap with src or dest.
+//   2. Copy the contents of the temporary array into dest.
+void *__arm_sc_memmove(void *dest, const void *src,
+                       size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  // If src and dest don't overlap then just invoke memcpy
+  if ((srcp > (destp + n)) || (destp > (srcp + n)))
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 1:
+  //     src: Low     |   ->   |     High
+  //    dest: Low  |   ->   |        High
+  // Here src is always ahead of dest at a higher address. If we first read a
+  // chunk of data from src we can safely write the same chunk to dest without
+  // corrupting future reads of src.
+  if (srcp > destp)
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 2:
+  //     src: Low  |   ->   |        High
+  //    dest: Low     |   ->   |     High
+  // While we're in the overlap region, writing to dest corrupts future reads
+  // of src. An efficient way to handle this is to copy the data in reverse,
+  // starting at the highest address.
+  return __arm_sc_memcpy_rev(dest, src, n);
+}
+
 const void *__arm_sc_memchr(const void *src, int c,
                             size_t n) __arm_streaming_compatible {
   const unsigned char *srcp = (const unsigned char *)src;
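The overlap classification above is easiest to check in isolation. Below is a
self-contained C++ stand-in (a sketch, assuming that dropping the
__arm_streaming_compatible attribute changes nothing about the control flow;
sketch_memmove is an illustrative name, not part of the patch):

    #include <cstddef>
    #include <cstdio>

    // Mirrors the three cases: disjoint ranges and "src above dest" copy
    // forward; "dest above src" must copy in reverse from the highest address.
    static void *sketch_memmove(void *dest, const void *src, size_t n) {
      unsigned char *d = static_cast<unsigned char *>(dest);
      const unsigned char *s = static_cast<const unsigned char *>(src);
      if (s > d + n || d > s + n || s > d) { // no forward hazard
        for (size_t i = 0; i < n; ++i)
          d[i] = s[i];
      } else {                               // dest ahead of src: go backwards
        while (n > 0) {
          --n;
          d[n] = s[n];
        }
      }
      return dest;
    }

    int main() {
      char buf[] = "abcdef";
      sketch_memmove(buf + 1, buf, 5); // overlapping, dest ahead of src
      std::printf("%s\n", buf);        // prints "aabcde"
    }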
diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h
index 22bf2b2514e57..7637417649e62 100644
--- a/compiler-rt/lib/builtins/fp_extend.h
+++ b/compiler-rt/lib/builtins/fp_extend.h
@@ -37,7 +37,16 @@ static const int srcSigFracBits = 52;
 // srcBits - srcSigFracBits - 1
 static const int srcExpBits = 11;
 
-static inline int src_rep_t_clz_impl(src_rep_t a) { return __builtin_clzll(a); }
+static inline int src_rep_t_clz_impl(src_rep_t a) {
+#if defined __LP64__
+  return __builtin_clzl(a);
+#else
+  if (a & REP_C(0xffffffff00000000))
+    return clzsi(a >> 32);
+  else
+    return 32 + clzsi(a & REP_C(0xffffffff));
+#endif
+}
 #define src_rep_t_clz src_rep_t_clz_impl
 
 #elif defined SRC_80
diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h
index b2a89506135be..8404d98c93508 100644
--- a/compiler-rt/lib/builtins/fp_lib.h
+++ b/compiler-rt/lib/builtins/fp_lib.h
@@ -58,7 +58,16 @@ typedef double fp_t;
 #define REP_C UINT64_C
 #define significandBits 52
 
-static inline int rep_clz(rep_t a) { return __builtin_clzll(a); }
+static __inline int rep_clz(rep_t a) {
+#if defined __LP64__
+  return __builtin_clzl(a);
+#else
+  if (a & REP_C(0xffffffff00000000))
+    return clzsi(a >> 32);
+  else
+    return 32 + clzsi(a & REP_C(0xffffffff));
+#endif
+}
 
 #define loWord(a) (a & 0xffffffffU)
 #define hiWord(a) (a >> 32)
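Both clz fallbacks use the same split: if any of the high 32 bits are set,
count leading zeros there; otherwise count in the low word and add 32. A
self-contained sketch, with clz32 standing in for compiler-rt's internal
clzsi helper (an assumed stand-in for illustration; like clzsi, it requires
a nonzero argument):

    #include <cstdint>
    #include <cstdio>

    static int clz32(uint32_t x) { return __builtin_clz(x); } // x != 0

    // 64-bit count-leading-zeros composed from a 32-bit primitive, as in the
    // non-__LP64__ branches above.
    static int clz64(uint64_t a) {
      if (a & 0xffffffff00000000ULL)
        return clz32(static_cast<uint32_t>(a >> 32));
      return 32 + clz32(static_cast<uint32_t>(a)); // high word is all zeros
    }

    int main() {
      std::printf("%d %d %d\n", clz64(1), clz64(0x100000000ULL), clz64(~0ULL));
      // prints "63 31 0"
    }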
diff --git a/compiler-rt/lib/rtsan/rtsan.cpp b/compiler-rt/lib/rtsan/rtsan.cpp
index cf7fbddd9eb9c..43a63b4636f1e 100644
--- a/compiler-rt/lib/rtsan/rtsan.cpp
+++ b/compiler-rt/lib/rtsan/rtsan.cpp
@@ -12,23 +12,10 @@
 #include <rtsan/rtsan_context.h>
 #include <rtsan/rtsan_interceptors.h>
 
-using namespace __rtsan;
-
-bool __rtsan::rtsan_initialized;
-bool __rtsan::rtsan_init_is_running;
-
 extern "C" {
 
 SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_init() {
-  CHECK(!rtsan_init_is_running);
-  if (rtsan_initialized)
-    return;
-  rtsan_init_is_running = true;
-
-  InitializeInterceptors();
-
-  rtsan_init_is_running = false;
-  rtsan_initialized = true;
+  __rtsan::InitializeInterceptors();
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_realtime_enter() {
diff --git a/compiler-rt/lib/rtsan/rtsan.h b/compiler-rt/lib/rtsan/rtsan.h
index ccddaf2c893ef..8b1219c13cbd3 100644
--- a/compiler-rt/lib/rtsan/rtsan.h
+++ b/compiler-rt/lib/rtsan/rtsan.h
@@ -14,13 +14,6 @@
 
 extern "C" {
 
-namespace __rtsan {
-
-extern bool rtsan_initialized;
-extern bool rtsan_init_is_running;
-
-} // namespace __rtsan
-
 // Initialise rtsan interceptors.
 // A call to this method is added to the preinit array on Linux systems.
 SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_init();
diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
index 4d5423ec629d2..3a65f9d3f779d 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
@@ -10,14 +10,10 @@
 
 #include "rtsan/rtsan_interceptors.h"
 
-#include "interception/interception.h"
-#include "sanitizer_common/sanitizer_allocator_dlsym.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_platform.h"
 #include "sanitizer_common/sanitizer_platform_interceptors.h"
 
 #include "interception/interception.h"
-#include "rtsan/rtsan.h"
 #include "rtsan/rtsan_context.h"
 
 #if SANITIZER_APPLE
@@ -39,15 +35,6 @@
 
 using namespace __sanitizer;
 
-using __rtsan::rtsan_init_is_running;
-using __rtsan::rtsan_initialized;
-
-namespace {
-struct DlsymAlloc : public DlSymAllocator<DlsymAlloc> {
-  static bool UseImpl() { return !rtsan_initialized; }
-};
-} // namespace
-
 void ExpectNotRealtime(const char *intercepted_function_name) {
   __rtsan::GetContextForThisThread().ExpectNotRealtime(
       intercepted_function_name);
@@ -251,17 +238,11 @@ INTERCEPTOR(int, nanosleep, const struct timespec *rqtp,
 // Memory
 
 INTERCEPTOR(void *, calloc, SIZE_T num, SIZE_T size) {
-  if (DlsymAlloc::Use())
-    return DlsymAlloc::Callocate(num, size);
-
   ExpectNotRealtime("calloc");
   return REAL(calloc)(num, size);
 }
 
 INTERCEPTOR(void, free, void *ptr) {
-  if (DlsymAlloc::PointerIsMine(ptr))
-    return DlsymAlloc::Free(ptr);
-
   if (ptr != NULL) {
     ExpectNotRealtime("free");
   }
@@ -269,17 +250,11 @@ INTERCEPTOR(void, free, void *ptr) {
 }
 
 INTERCEPTOR(void *, malloc, SIZE_T size) {
-  if (DlsymAlloc::Use())
-    return DlsymAlloc::Allocate(size);
-
   ExpectNotRealtime("malloc");
   return REAL(malloc)(size);
 }
 
 INTERCEPTOR(void *, realloc, void *ptr, SIZE_T size) {
-  if (DlsymAlloc::Use() || DlsymAlloc::PointerIsMine(ptr))
-    return DlsymAlloc::Realloc(ptr, size);
-
   ExpectNotRealtime("realloc");
   return REAL(realloc)(ptr, size);
 }
diff --git a/compiler-rt/lib/rtsan/tests/CMakeLists.txt b/compiler-rt/lib/rtsan/tests/CMakeLists.txt
index 3b783c90c2658..9eda116541ae3 100644
--- a/compiler-rt/lib/rtsan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/rtsan/tests/CMakeLists.txt
@@ -36,7 +36,6 @@ set(RTSAN_UNITTEST_LINK_FLAGS
   ${SANITIZER_TEST_CXX_LIBRARIES}
   -no-pie)
 
-append_list_if(COMPILER_RT_HAS_LIBATOMIC -latomic RTSAN_UNITTEST_LINK_FLAGS)
 append_list_if(COMPILER_RT_HAS_LIBDL -ldl RTSAN_UNITTEST_LINK_FLAGS)
 append_list_if(COMPILER_RT_HAS_LIBRT -lrt RTSAN_UNITTEST_LINK_FLAGS)
 append_list_if(COMPILER_RT_HAS_LIBM -lm RTSAN_UNITTEST_LINK_FLAGS)
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 49c9dcbef358f..032b04a09ae76 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -6336,7 +6336,7 @@ INTERCEPTOR(void*, dlopen, const char *filename, int flag) {
 
       const char *SelfFName = DladdrSelfFName();
       VPrintf(1, "dlopen interceptor: DladdrSelfFName: %p %s\n",
-              (const void *)SelfFName, SelfFName);
+              (void *)SelfFName, SelfFName);
 
       if (SelfFName && internal_strcmp(SelfFName, filename) == 0) {
         // It's possible they copied the string from dladdr, so
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
index ee3fd5b0a817a..7d7c009f64442 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
@@ -47,8 +47,7 @@ void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
   struct kinfo_proc2 *InfoProc;
   uptr Len = sizeof(*InfoProc);
   uptr Size = Len;
-  const int Mib[] = {CTL_KERN, KERN_PROC2, KERN_PROC_PID,
-                     getpid(), (int)Size,  1};
+  const int Mib[] = {CTL_KERN, KERN_PROC2, KERN_PROC_PID, getpid(), Size, 1};
   InfoProc = (struct kinfo_proc2 *)MmapOrDie(Size, "GetMemoryProfile()");
   CHECK_EQ(
       internal_sysctl(Mib, ARRAY_SIZE(Mib), nullptr, (uptr *)InfoProc, &Len, 0),
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp
index 5631d132da74a..dddae441d5de9 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp
@@ -269,7 +269,7 @@ void FormattedStackTracePrinter::RenderFrame(InternalScopedString *buffer,
       break;
     default:
       Report("Unsupported specifier in stack frame format: %c (%p)!\n", *p,
-             (const void *)p);
+             (void *)p);
       Die();
     }
   }
@@ -323,7 +323,7 @@ void FormattedStackTracePrinter::RenderData(InternalScopedString *buffer,
         break;
       default:
         Report("Unsupported specifier in stack frame format: %c (%p)!\n", *p,
-               (const void *)p);
+               (void *)p);
         Die();
     }
   }
diff --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
index a85eb737dba0a..672cdc0ba655e 100644
--- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
@@ -46,8 +46,11 @@ set(SCUDO_UNITTEST_LINK_FLAGS
   ${COMPILER_RT_UNWINDER_LINK_LIBS}
   ${SANITIZER_TEST_CXX_LIBRARIES})
 list(APPEND SCUDO_UNITTEST_LINK_FLAGS -pthread -no-pie)
-
-append_list_if(COMPILER_RT_HAS_LIBATOMIC -latomic SCUDO_UNITTEST_LINK_FLAGS)
+# Linking against libatomic is required with some compilers
+check_library_exists(atomic __atomic_load_8 "" COMPILER_RT_HAS_LIBATOMIC)
+if (COMPILER_RT_HAS_LIBATOMIC)
+  list(APPEND SCUDO_UNITTEST_LINK_FLAGS -latomic)
+endif()
 
 set(SCUDO_TEST_HEADERS
   scudo_unit_test.h
diff --git a/compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp b/compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp
index 87be90014d56e..3266eef826eda 100644
--- a/compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp
+++ b/compiler-rt/test/asan/TestCases/Posix/ignore_free_hook.cpp
@@ -1,15 +1,15 @@
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=basic_hook_works && not %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-BASIC
+// RUN:   |& FileCheck %s -check-prefix=CHECK-BASIC
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore && %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE
+// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore_twice && not %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-2
+// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE-2
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=mismatch && %env_asan_opts=alloc_dealloc_mismatch=1 not %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-MISMATCH
+// RUN:   |& FileCheck %s -check-prefix=CHECK-MISMATCH
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=ignore_mismatch && %env_asan_opts=alloc_dealloc_mismatch=1 %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-MISMATCH
+// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE-MISMATCH
 // RUN: %clangxx_asan -O2 %s -o %t -DTEST=double_delete && not %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
+// RUN:   |& FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
 
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp b/compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp
index 8c48654f84c0a..b6f17d89083b4 100644
--- a/compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp
+++ b/compiler-rt/test/hwasan/TestCases/Posix/ignore_free_hook.cpp
@@ -1,11 +1,11 @@
 // RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=basic_hook_works && not %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-BASIC
+// RUN:   |& FileCheck %s -check-prefix=CHECK-BASIC
 // RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=ignore && %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE
+// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE
 // RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=ignore_twice && not %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-2
+// RUN:   |& FileCheck %s -check-prefix=CHECK-IGNORE-2
 // RUN: %clangxx_hwasan -O2 %s -o %t -DTEST=double_delete && not %run %t \
-// RUN:   2>&1 | FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
+// RUN:   |& FileCheck %s -check-prefix=CHECK-DOUBLE-DELETE
 
 #include <sanitizer/hwasan_interface.h>
 #include <stdio.h>
diff --git a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
index 411640fc5176d..24a96a227393a 100644
--- a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
+++ b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
@@ -109,25 +109,6 @@
 // ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750
 // ICTEXT: _ZTV8Derived1:250
 
-// When vtable value profiles exist, pgo-instr-use pass should annotate them
-// even if `-enable-vtable-value-profiling` is not explicitly on.
-// RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -O2 \
-// RUN:   -mllvm -print-after=pgo-instr-use -mllvm -filter-print-funcs=main \
-// RUN:   -mllvm -print-module-scope %s 2>&1 | FileCheck %s --check-prefix=ANNOTATE
-
-// ANNOTATE-NOT: Inconsistent number of value sites
-// ANNOTATE: !{!"VP", i32 2
-
-// When vtable value profiles exist, pgo-instr-use pass will not annotate them
-// if `-icp-max-num-vtables` is set to zero.
-// RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -O2 \
-// RUN:   -mllvm -icp-max-num-vtables=0 -mllvm -print-after=pgo-instr-use \
-// RUN:   -mllvm -filter-print-funcs=main -mllvm -print-module-scope %s 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=OMIT
-
-// OMIT: Inconsistent number of value sites
-// OMIT-NOT: !{!"VP", i32 2
-
 // Test indirect call promotion transformation using vtable profiles.
 // - Build with `-g` to enable debug information.
 // - In real world settings, ICP pass is disabled in prelink pipeline. In
@@ -147,12 +128,12 @@
 // RUN:    | FileCheck %s --check-prefixes=REMARK,IR --implicit-check-not="!VP"
 
 // For the indirect call site `ptr->func`
-// REMARK: instrprof-vtable-value-prof.cpp:226:19: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, sink 1 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
-// REMARK: instrprof-vtable-value-prof.cpp:226:19: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
+// REMARK: instrprof-vtable-value-prof.cpp:205:19: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, sink 1 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
+// REMARK: instrprof-vtable-value-prof.cpp:205:19: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
 //
 // For the indirect call site `delete ptr`
-// REMARK: instrprof-vtable-value-prof.cpp:228:5: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, sink 2 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
-// REMARK: instrprof-vtable-value-prof.cpp:228:5: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, sink 2 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
+// REMARK: instrprof-vtable-value-prof.cpp:207:5: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, sink 2 instruction(s) and compare 1 vtable(s): {_ZTVN12_GLOBAL__N_18Derived2E}
+// REMARK: instrprof-vtable-value-prof.cpp:207:5: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, sink 2 instruction(s) and compare 1 vtable(s): {_ZTV8Derived1}
 
 // The IR matchers for indirect callsite `ptr->func`.
 // IR-LABEL: @main
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index 9935fe6a199da..774c4eaf4d976 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -4,6 +4,9 @@
 import subprocess
 import sys
 
+# TODO: LooseVersion is undocumented; use something else.
+from distutils.version import LooseVersion
+
 import lit.formats
 import lit.util
 
@@ -276,11 +279,7 @@ def get_clang_default_dwarf_version_string(triple):
 gdb_version_string = get_gdb_version_string()
 if dwarf_version_string and gdb_version_string:
     if int(dwarf_version_string) >= 5:
-        try:
-            from packaging import version
-        except:
-            lit_config.fatal("Running gdb tests requires the packaging package")
-        if version.parse(gdb_version_string) < version.parse("10.1"):
+        if LooseVersion(gdb_version_string) < LooseVersion("10.1"):
             # Example for llgdb-tests, which use lldb on darwin but gdb elsewhere:
             # XFAIL: !system-darwin && gdb-clang-incompatibility
             config.available_features.add("gdb-clang-incompatibility")
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index fd873f55dd844..ba65b644e5a93 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -1227,32 +1227,26 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
     return hlfir::Entity{copyIn.getCopiedIn()};
   };
 
-  auto genSetDynamicTypeToDummyType = [&](hlfir::Entity var) -> hlfir::Entity {
-    fir::BaseBoxType boxType = fir::BoxType::get(
-        hlfir::getFortranElementOrSequenceType(dummyTypeWithActualRank));
-    if (actualIsAssumedRank)
-      return hlfir::Entity{builder.create<fir::ReboxAssumedRankOp>(
-          loc, boxType, var, fir::LowerBoundModifierAttribute::SetToOnes)};
-    // Use actual shape when creating descriptor with dummy type, the dummy
-    // shape may be unknown in case of sequence association.
-    mlir::Type actualTy =
-        hlfir::getFortranElementOrSequenceType(actual.getType());
-    boxType = boxType.getBoxTypeWithNewShape(actualTy);
-    return hlfir::Entity{builder.create<fir::ReboxOp>(loc, boxType, var,
-                                                      /*shape=*/mlir::Value{},
-                                                      /*slice=*/mlir::Value{})};
-  };
-
   // Step 2: prepare the storage for the dummy arguments, ensuring that it
   // matches the dummy requirements (e.g., must be contiguous or must be
   // a temporary).
   hlfir::Entity entity =
       hlfir::derefPointersAndAllocatables(loc, builder, actual);
   if (entity.isVariable()) {
-    // Set dynamic type if needed before any copy-in or copy so that the dummy
-    // is contiguous according to the dummy type.
-    if (mustSetDynamicTypeToDummyType)
-      entity = genSetDynamicTypeToDummyType(entity);
+    if (mustSetDynamicTypeToDummyType) {
+      // Note: it is important to do this before any copy-in or copy so
+      // that the dummy is contiguous according to the dummy type.
+      mlir::Type boxType = fir::BoxType::get(
+          hlfir::getFortranElementOrSequenceType(dummyTypeWithActualRank));
+      if (actualIsAssumedRank) {
+        entity = hlfir::Entity{builder.create<fir::ReboxAssumedRankOp>(
+            loc, boxType, entity, fir::LowerBoundModifierAttribute::SetToOnes)};
+      } else {
+        entity = hlfir::Entity{builder.create<fir::ReboxOp>(
+            loc, boxType, entity, /*shape=*/mlir::Value{},
+            /*slice=*/mlir::Value{})};
+      }
+    }
     if (arg.hasValueAttribute() ||
         // Constant expressions might be lowered as variables with
         // 'parameter' attribute. Even though the constant expressions
@@ -1291,14 +1285,20 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
         loc, builder, entity, storageType, "", byRefAttr);
     entity = hlfir::Entity{associate.getBase()};
     preparedDummy.pushExprAssociateCleanUp(associate);
-    // Rebox the actual argument to the dummy argument's type, and make sure
-    // that we pass a contiguous entity (i.e. make copy-in, if needed).
-    //
-    // TODO: this can probably be optimized by associating the expression with
-    // properly typed temporary, but this needs either a new operation or
-    // making the hlfir.associate more complex.
     if (mustSetDynamicTypeToDummyType) {
-      entity = genSetDynamicTypeToDummyType(entity);
+      // Rebox the actual argument to the dummy argument's type, and make
+      // sure that we pass a contiguous entity (i.e. make copy-in,
+      // if needed).
+      //
+      // TODO: this can probably be optimized by associating the expression
+      // with a properly typed temporary, but this needs either a new operation
+      // or making the hlfir.associate more complex.
+      assert(!actualIsAssumedRank && "only variables are assumed-rank");
+      mlir::Type boxType = fir::BoxType::get(
+          hlfir::getFortranElementOrSequenceType(dummyTypeWithActualRank));
+      entity = hlfir::Entity{builder.create<fir::ReboxOp>(
+          loc, boxType, entity, /*shape=*/mlir::Value{},
+          /*slice=*/mlir::Value{})};
       entity = genCopyIn(entity, /*doCopyOut=*/false);
     }
   }
diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h
index 5b8d1f2359c5b..e8d6bb591e361 100644
--- a/flang/lib/Lower/DirectivesCommon.h
+++ b/flang/lib/Lower/DirectivesCommon.h
@@ -33,7 +33,6 @@
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
-#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/openmp-directive-sets.h"
@@ -136,22 +135,6 @@ static inline void genOmpAtomicHintAndMemoryOrderClauses(
   }
 }
 
-template <typename AtomicListT>
-static void processOmpAtomicTODO(mlir::Type elementType, mlir::Location loc) {
-  if (!elementType)
-    return;
-  if constexpr (std::is_same<AtomicListT,
-                             Fortran::parser::OmpAtomicClauseList>()) {
-    // Based on assertion for supported element types in OMPIRBuilder.cpp
-    // createAtomicRead
-    mlir::Type unwrappedEleTy = fir::unwrapRefType(elementType);
-    bool supportedAtomicType =
-        (!fir::isa_complex(unwrappedEleTy) && fir::isa_trivial(unwrappedEleTy));
-    if (!supportedAtomicType)
-      TODO(loc, "Unsupported atomic type");
-  }
-}
-
 /// Used to generate atomic.read operation which is created in existing
 /// location set by builder.
 template <typename AtomicListT>
@@ -164,8 +147,6 @@ static inline void genOmpAccAtomicCaptureStatement(
   // Generate `atomic.read` operation for atomic assignment statements
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  processOmpAtomicTODO<AtomicListT>(elementType, loc);
-
   if constexpr (std::is_same<AtomicListT,
                              Fortran::parser::OmpAtomicClauseList>()) {
     // If no hint clause is specified, the effect is as if
@@ -202,8 +183,6 @@ static inline void genOmpAccAtomicWriteStatement(
   mlir::Type varType = fir::unwrapRefType(lhsAddr.getType());
   rhsExpr = firOpBuilder.createConvert(loc, varType, rhsExpr);
 
-  processOmpAtomicTODO<AtomicListT>(varType, loc);
-
   if constexpr (std::is_same<AtomicListT,
                              Fortran::parser::OmpAtomicClauseList>()) {
     // If no hint clause is specified, the effect is as if
@@ -344,8 +323,6 @@ static inline void genOmpAccAtomicUpdateStatement(
         currentLocation, lhsAddr);
   }
 
-  processOmpAtomicTODO<AtomicListT>(varType, loc);
-
   llvm::SmallVector<mlir::Type> varTys = {varType};
   llvm::SmallVector<mlir::Location> locs = {currentLocation};
   firOpBuilder.createBlock(&atomicUpdateOp->getRegion(0), {}, varTys, locs);
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 7e76a81e0df92..c1d4c089df3b2 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -17,7 +17,6 @@
 #include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
-#include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Semantics/tools.h"
 
 namespace Fortran {
@@ -128,52 +127,8 @@ void DataSharingProcessor::copyFirstPrivateSymbol(
 void DataSharingProcessor::copyLastPrivateSymbol(
     const semantics::Symbol *sym,
     [[maybe_unused]] mlir::OpBuilder::InsertPoint *lastPrivIP) {
-  if (sym->test(semantics::Symbol::Flag::OmpLastPrivate)) {
-    bool allocatable = semantics::IsAllocatable(sym->GetUltimate());
-    if (!allocatable) {
-      converter.copyHostAssociateVar(*sym, lastPrivIP);
-      return;
-    }
-
-    // copyHostAssociateVar doesn't work properly if the privatised copy was
-    // reallocated (e.g. by assignment): it will only copy if the ultimate
-    // symbol was already allocated, and it only copies data so any reallocated
-    // lengths etc are lost
-
-    // 1) Fetch the original copy of the variable.
-    assert(sym->has<Fortran::semantics::HostAssocDetails>() &&
-           "No host-association found");
-    const Fortran::semantics::Symbol &hsym = sym->GetUltimate();
-    Fortran::lower::SymbolBox hsb = symTable->lookupOneLevelUpSymbol(hsym);
-    assert(hsb && "Host symbol box not found");
-
-    // 2) Fetch the copied one that will mask the original.
-    Fortran::lower::SymbolBox sb = symTable->shallowLookupSymbol(sym);
-    assert(sb && "Host-associated symbol box not found");
-    assert(hsb.getAddr() != sb.getAddr() &&
-           "Host and associated symbol boxes are the same");
-
-    // 3) Perform the assignment.
-    fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-    mlir::Location loc = converter.genLocation(sym->name());
-    mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint();
-    if (lastPrivIP && lastPrivIP->isSet())
-      builder.restoreInsertionPoint(*lastPrivIP);
-    else
-      builder.setInsertionPointAfter(sb.getAddr().getDefiningOp());
-
-    hlfir::Entity dst{hsb.getAddr()};
-    hlfir::Entity src{sb.getAddr()};
-    builder.create<hlfir::AssignOp>(
-        loc, src, dst, /*isWholeAllocatableAssignment=*/allocatable,
-        /*keepLhsLengthInAllocatableAssignment=*/false,
-        /*temporary_lhs=*/false);
-
-    if (lastPrivIP && lastPrivIP->isSet() &&
-        sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) {
-      builder.restoreInsertionPoint(insPt);
-    }
-  }
+  if (sym->test(semantics::Symbol::Flag::OmpLastPrivate))
+    converter.copyHostAssociateVar(*sym, lastPrivIP);
 }
 
 void DataSharingProcessor::collectOmpObjectListSymbol(
diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
index fdf24028c5f9b..c57f5140c29eb 100644
--- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
@@ -18,8 +18,8 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
-#include "mlir/IR/Iterators.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 
@@ -325,25 +325,6 @@ class DummyScopeOpConversion
   }
 };
 
-/// Simple DCE to erase fir.shape/shift/slice/unused shape operands after this
-/// pass (fir.shape and like have no codegen).
-/// mlir::RegionDCE is expensive and requires running
-/// mlir::eraseUnreachableBlocks. It does things that are not needed here, like
-/// removing unused block arguments. fir.shape/shift/slice cannot be block
-/// arguments.
-/// This helper does a naive backward walk of the IR. It is not even guaranteed
-/// to walk blocks according to backward dominance, but that is good enough for
-/// what is done here, fir.shape/shift/slice have no usages anymore. The
-/// backward walk allows getting rid of most of the unused operands, it is not a
-/// problem to leave some in the weird cases.
-static void simpleDCE(mlir::RewriterBase &rewriter, mlir::Operation *op) {
-  op->walk<mlir::WalkOrder::PostOrder, mlir::ReverseIterator>(
-      [&](mlir::Operation *subOp) {
-        if (mlir::isOpTriviallyDead(subOp))
-          rewriter.eraseOp(subOp);
-      });
-}
-
 class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
 public:
   using CodeGenRewriteBase<CodeGenRewrite>::CodeGenRewriteBase;
@@ -375,7 +356,7 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
     }
     // Erase any residual (fir.shape, fir.slice...).
     mlir::IRRewriter rewriter(&context);
-    simpleDCE(rewriter, mod.getOperation());
+    (void)mlir::runRegionDCE(rewriter, mod->getRegions());
   }
 };
 
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 685a8645fa2fc..8bb24fb6c8078 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -262,13 +262,9 @@ void AddDebugInfoPass::runOnOperation() {
 
     mlir::StringAttr fullName =
         mlir::StringAttr::get(context, funcOp.getName());
-    mlir::Attribute attr = funcOp->getAttr(fir::getInternalFuncNameAttrName());
+    auto result = fir::NameUniquer::deconstruct(funcOp.getName());
     mlir::StringAttr funcName =
-        (attr) ? mlir::cast<mlir::StringAttr>(attr)
-               : mlir::StringAttr::get(context, funcOp.getName());
-
-    auto result = fir::NameUniquer::deconstruct(funcName);
-    funcName = mlir::StringAttr::get(context, result.second.name);
+        mlir::StringAttr::get(context, result.second.name);
 
     llvm::SmallVector<mlir::LLVM::DITypeAttr> types;
     fir::DebugTypeGenerator typeGen(module);
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index d587fd44b1678..8588af7e16eec 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -295,15 +295,3 @@ else()
     FortranRuntime.static_dbg FortranRuntime.dynamic_dbg)
 endif()
 set_target_properties(FortranRuntime PROPERTIES FOLDER "Flang/Runtime Libraries")
-
-# If FortranRuntime is part of a Flang build (and not a separate build) then
-# add dependency to make sure that Fortran runtime library is being built after
-# we have the Flang compiler available.  This also includes the MODULE files
-# that compile when the 'flang-new' target is built.
-#
-# TODO: This is a workaround and should be updated when runtime build procedure
-# is changed to a regular runtime build.  See discussion in PR #95388.
-if (TARGET flang-new AND TARGET module_files)
-  add_dependencies(FortranRuntime flang-new module_files)
-endif()
-
diff --git a/flang/runtime/derived.cpp b/flang/runtime/derived.cpp
index 659f54fa344bb..0d9e033df4e27 100644
--- a/flang/runtime/derived.cpp
+++ b/flang/runtime/derived.cpp
@@ -23,9 +23,10 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
     const typeInfo::Component &comp, const Descriptor &derivedInstance) {
   const typeInfo::Value *bounds{comp.bounds()};
   for (int dim{0}; dim < comp.rank(); ++dim) {
-    auto lb{bounds[2 * dim].GetValue(&derivedInstance).value_or(0)};
-    auto ub{bounds[2 * dim + 1].GetValue(&derivedInstance).value_or(0)};
-    extents[dim] = ub >= lb ? static_cast<SubscriptValue>(ub - lb + 1) : 0;
+    SubscriptValue lb{bounds[2 * dim].GetValue(&derivedInstance).value_or(0)};
+    SubscriptValue ub{
+        bounds[2 * dim + 1].GetValue(&derivedInstance).value_or(0)};
+    extents[dim] = ub >= lb ? ub - lb + 1 : 0;
   }
 }
 
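As a quick check of the extent arithmetic above: a dimension with bounds
lb..ub holds ub - lb + 1 elements when ub >= lb and is empty otherwise, e.g.:

    #include <cstdio>

    int main() {
      long long lb = -2, ub = 4;
      std::printf("%lld\n", ub >= lb ? ub - lb + 1 : 0); // 7: -2,-1,0,1,2,3,4
      lb = 5; ub = 1;
      std::printf("%lld\n", ub >= lb ? ub - lb + 1 : 0); // 0: empty dimension
    }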
diff --git a/flang/test/Fir/declare-codegen.fir b/flang/test/Fir/declare-codegen.fir
index fe8d84ef2d19f..c5879facb1572 100644
--- a/flang/test/Fir/declare-codegen.fir
+++ b/flang/test/Fir/declare-codegen.fir
@@ -38,17 +38,3 @@ func.func @useless_shape_with_duplicate_extent_operand(%arg0: !fir.ref<!fir.arra
 
 // DECL-LABEL: func.func @useless_shape_with_duplicate_extent_operand(
 // DECL: fircg.ext_declare
-
-// Test DCE does not crash because of unreachable code.
-func.func @unreachable_code(%arg0: !fir.ref<!fir.char<1,10>>) {
-  %c10 = arith.constant 10 : index
-  %2 = fir.declare %arg0 typeparams %c10 {uniq_name = "live_code"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>)
-  return
-^bb2:  // no predecessors
-  %3 = fir.declare %arg0 typeparams %c10 {uniq_name = "dead_code"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>)
-  fir.unreachable
-}
-// NODECL-LABEL:   func.func @unreachable_code(
-// NODECL-NOT:       uniq_name = "live_code"
-// DECL-LABEL:    func.func @unreachable_code(
-// DECL:             uniq_name = "live_code"
diff --git a/flang/test/Lower/HLFIR/calls-poly-to-nonpoly.f90 b/flang/test/Lower/HLFIR/calls-poly-to-nonpoly.f90
deleted file mode 100644
index 3c60a84692bdb..0000000000000
--- a/flang/test/Lower/HLFIR/calls-poly-to-nonpoly.f90
+++ /dev/null
@@ -1,26 +0,0 @@
-! Test passing polymorphic variable for non-polymorphic dummy arguments:
-! RUN: bbc -emit-hlfir -o - -I nowhere %s | FileCheck %s
-
-subroutine test_sequence_association(x)
-  type t
-    integer :: i
-  end type
-  interface
-    subroutine sequence_assoc(x, n)
-      import :: t
-      type(t) :: x(n)
-    end subroutine
-  end interface
-  class(t) :: x(:, :)
-  call sequence_assoc(x, 100)
-end subroutine
-! CHECK-LABEL:   func.func @_QPtest_sequence_association(
-! CHECK-SAME:                                            %[[VAL_0:.*]]: !fir.class<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>
-! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]]
-! CHECK:           %[[REBOX:.*]] = fir.rebox %[[VAL_3]]#0 : (!fir.class<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>) -> !fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>
-! CHECK:           %[[VAL_5:.*]]:2 = hlfir.copy_in %[[REBOX]] to %[[VAL_1]] : (!fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>>>) -> (!fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>, i1)
-! CHECK:           %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]]#0 : (!fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>) -> !fir.ref<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>
-! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.ref<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>) -> !fir.ref<!fir.array<?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>
-! CHECK:           fir.call @_QPsequence_assoc(%[[VAL_7]], %{{.*}})
-! CHECK:           hlfir.copy_out %[[VAL_1]], %[[VAL_5]]#1 to %[[REBOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>>>, i1, !fir.box<!fir.array<?x?x!fir.type<_QFtest_sequence_associationTt{i:i32}>>>) -> ()
diff --git a/flang/test/Lower/OpenMP/Todo/atomic-character.f90 b/flang/test/Lower/OpenMP/Todo/atomic-character.f90
deleted file mode 100644
index 88effa4a2a515..0000000000000
--- a/flang/test/Lower/OpenMP/Todo/atomic-character.f90
+++ /dev/null
@@ -1,8 +0,0 @@
-! RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-! CHECK: not yet implemented: Unsupported atomic type
-subroutine character_atomic
-  character :: l, r
-  !$omp atomic read
-    l = r
-end subroutine
diff --git a/flang/test/Lower/OpenMP/Todo/atomic-complex.f90 b/flang/test/Lower/OpenMP/Todo/atomic-complex.f90
deleted file mode 100644
index 6d6e4399ee192..0000000000000
--- a/flang/test/Lower/OpenMP/Todo/atomic-complex.f90
+++ /dev/null
@@ -1,8 +0,0 @@
-! RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-! CHECK: not yet implemented: Unsupported atomic type
-subroutine complex_atomic
-  complex :: l, r
-  !$omp atomic read
-    l = r
-end subroutine
diff --git a/flang/test/Lower/OpenMP/atomic-read.f90 b/flang/test/Lower/OpenMP/atomic-read.f90
index 9559df171111d..8c3f37c94975e 100644
--- a/flang/test/Lower/OpenMP/atomic-read.f90
+++ b/flang/test/Lower/OpenMP/atomic-read.f90
@@ -5,18 +5,22 @@
 ! This test checks the lowering of atomic read
 
 !CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomic"} {
-!CHECK:    %[[A_REF:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"}
-!CHECK:    %[[A_DECL:.*]]:2 = hlfir.declare %[[A_REF]] {uniq_name = "_QFEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:    %[[B_REF:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"}
-!CHECK:    %[[B_DECL:.*]]:2 = hlfir.declare %[[B_REF]] {uniq_name = "_QFEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[A_C1:.*]] = arith.constant 1 : index
+!CHECK:    %[[A_REF:.*]] = fir.alloca !fir.char<1> {bindc_name = "a", uniq_name = "_QFEa"}
+!CHECK:    %[[A_DECL:.*]]:2 = hlfir.declare %[[A_REF]] typeparams %[[A_C1]] {uniq_name = "_QFEa"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
+!CHECK:    %[[B_C1:.*]] = arith.constant 1 : index
+!CHECK:    %[[B_REF:.*]] = fir.alloca !fir.char<1> {bindc_name = "b", uniq_name = "_QFEb"}
+!CHECK:    %[[B_DECL:.*]]:2 = hlfir.declare %[[B_REF]] typeparams %[[B_C1]] {uniq_name = "_QFEb"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
 !CHECK:    %[[C_REF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "c", uniq_name = "_QFEc"}
 !CHECK:    %[[C_DECL:.*]]:2 = hlfir.declare %[[C_REF]] {uniq_name = "_QFEc"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 !CHECK:    %[[D_REF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "d", uniq_name = "_QFEd"}
 !CHECK:    %[[D_DECL:.*]]:2 = hlfir.declare %[[D_REF]] {uniq_name = "_QFEd"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-!CHECK:    %[[E_REF:.*]] = fir.alloca i32 {bindc_name = "e", uniq_name = "_QFEe"}
-!CHECK:    %[[E_DECL:.*]]:2 = hlfir.declare %[[E_REF]] {uniq_name = "_QFEe"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:    %[[F_REF:.*]] = fir.alloca i32 {bindc_name = "f", uniq_name = "_QFEf"}
-!CHECK:    %[[F_DECL:.*]]:2 = hlfir.declare %[[F_REF]] {uniq_name = "_QFEf"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[E_C8:.*]] = arith.constant 8 : index
+!CHECK:    %[[E_REF:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "e", uniq_name = "_QFEe"}
+!CHECK:    %[[E_DECL:.*]]:2 = hlfir.declare %[[E_REF]] typeparams %[[E_C8]] {uniq_name = "_QFEe"} : (!fir.ref<!fir.char<1,8>>, index) -> (!fir.ref<!fir.char<1,8>>, !fir.ref<!fir.char<1,8>>)
+!CHECK:    %[[F_C8:.*]] = arith.constant 8 : index
+!CHECK:    %[[F_REF:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "f", uniq_name = "_QFEf"}
+!CHECK:    %[[F_DECL:.*]]:2 = hlfir.declare %[[F_REF]] typeparams %[[F_C8]] {uniq_name = "_QFEf"} : (!fir.ref<!fir.char<1,8>>, index) -> (!fir.ref<!fir.char<1,8>>, !fir.ref<!fir.char<1,8>>)
 !CHECK:    %[[G_REF:.*]] = fir.alloca f32 {bindc_name = "g", uniq_name = "_QFEg"}
 !CHECK:    %[[G_DECL:.*]]:2 = hlfir.declare %[[G_REF]] {uniq_name = "_QFEg"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK:    %[[H_REF:.*]] = fir.alloca f32 {bindc_name = "h", uniq_name = "_QFEh"}
@@ -26,9 +30,9 @@
 !CHECK:    %[[Y_REF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
 !CHECK:    %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_REF]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:    omp.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1   memory_order(acquire) hint(uncontended) : !fir.ref<i32>, i32
-!CHECK:    omp.atomic.read %[[A_DECL]]#1 = %[[B_DECL]]#1   memory_order(relaxed) : !fir.ref<i32>, i32
+!CHECK:    omp.atomic.read %[[A_DECL]]#1 = %[[B_DECL]]#1   memory_order(relaxed) : !fir.ref<!fir.char<1>>, !fir.char<1>
 !CHECK:    omp.atomic.read %[[C_DECL]]#1 = %[[D_DECL]]#1   memory_order(seq_cst) hint(contended) : !fir.ref<!fir.logical<4>>, !fir.logical<4>
-!CHECK:    omp.atomic.read %[[E_DECL]]#1 = %[[F_DECL]]#1   hint(speculative) : !fir.ref<i32>, i32
+!CHECK:    omp.atomic.read %[[E_DECL]]#1 = %[[F_DECL]]#1   hint(speculative) : !fir.ref<!fir.char<1,8>>, !fir.char<1,8>
 !CHECK:    omp.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1   hint(nonspeculative) : !fir.ref<f32>, f32
 !CHECK:    omp.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1   : !fir.ref<f32>, f32
 
@@ -36,9 +40,9 @@ program OmpAtomic
 
     use omp_lib
     integer :: x, y
-    integer :: a, b
+    character :: a, b
     logical :: c, d
-    integer :: e, f
+    character(8) :: e, f
     real g, h
     !$omp atomic acquire read hint(omp_sync_hint_uncontended)
        x = y
diff --git a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90 b/flang/test/Lower/OpenMP/lastprivate-allocatable.f90
deleted file mode 100644
index 41bbb182aade2..0000000000000
--- a/flang/test/Lower/OpenMP/lastprivate-allocatable.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! RUN: %flang_fc1 -emit-hlfir -o - -fopenmp %s | FileCheck %s
-! RUN: bbc -emit-hlfir -o - -fopenmp %s | FileCheck %s
-
-program lastprivate_allocatable
-  integer, allocatable :: a
-  integer :: i
-  ! a is unallocated here
-  !$omp parallel do lastprivate(a)
-  do i=1,1
-    a = 42
-  enddo
-  !$omp end parallel do
-  ! a should be allocated here
-end program
-
-! CHECK-LABEL:   func.func @_QQmain()
-! CHECK:           %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "a", uniq_name = "_QFEa"}
-! CHECK:           %[[VAL_1:.*]] = fir.zero_bits !fir.heap<i32>
-! CHECK:           %[[VAL_2:.*]] = fir.embox %[[VAL_1]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>>
-! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
-! CHECK:           omp.parallel {
-!                    create original copy of private variable
-! CHECK:             %[[VAL_16:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = {{.*}}<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
-! CHECK:             %[[VAL_17:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"}
-! CHECK:             %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK:             omp.wsloop {
-! CHECK:               omp.loop_nest
-!                        [...]
-!                        if this is the last iteration
-! CHECK:                 fir.if %{{.*}} {
-!                          store loop IV
-! CHECK:                   fir.store %{{.*}} to %[[VAL_18]]#1 : !fir.ref<i32>
-!                          assign private variable to original copy: realloc
-! CHECK:                   hlfir.assign %[[VAL_16]]#0 to %[[VAL_3]]#0 realloc : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>
-! CHECK-NEXT:            }
-! CHECK-NEXT:            omp.yield
-! CHECK-NEXT:          }
diff --git a/flang/test/Transforms/debug-92391.fir b/flang/test/Transforms/debug-92391.fir
deleted file mode 100644
index 2d83be995a0f6..0000000000000
--- a/flang/test/Transforms/debug-92391.fir
+++ /dev/null
@@ -1,18 +0,0 @@
-// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
-
-module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>, #dlti.dl_entry<"dlti.endianness", "little">>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"} {
-  func.func @my_square_(%arg0: !fir.ref<i32> {fir.bindc_name = "x"} ) -> i32 attributes {fir.internal_name = "_QPmy_square"} {
-    %0 = fir.undefined !fir.dscope
-    %1 = fir.alloca i32 {bindc_name = "my_square", uniq_name = "_QFmy_squareEmy_square"}
-    %2 = fircg.ext_declare %1 {uniq_name = "_QFmy_squareEmy_square"} : (!fir.ref<i32>) -> !fir.ref<i32>
-    %3 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFmy_squareEx"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
-    %4 = fir.load %3 : !fir.ref<i32>
-    %5 = arith.muli %4, %4 : i32
-    fir.store %5 to %2 : !fir.ref<i32>
-    %6 = fir.load %2 : !fir.ref<i32>
-    return %6 : i32
-  } loc(#loc1)
-}
-#loc1 = loc("92391.f90":15:1)
-
-// CHECK: #di_subprogram = #llvm.di_subprogram<{{.*}}name = "my_square", linkageName = "my_square_"{{.*}}>
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 69518acff3a5d..29e27724e1ab3 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -7,7 +7,7 @@ function(add_benchmark benchmark_name)
     "BENCHMARK"
     "" # Optional arguments
     "" # Single value arguments
-    "LINK_LIBRARIES;DEPENDS" # Multi-value arguments
+    "LINK_LIBRARIES" # Multi-value arguments
     ${ARGN}
   )
   
@@ -20,9 +20,6 @@ function(add_benchmark benchmark_name)
     LINK_LIBRARIES
       LibcGpuBenchmark.hermetic
       ${BENCHMARK_LINK_LIBRARIES}
-    DEPENDS
-      libc.src.stdio.printf
-      ${BENCHMARK_DEPENDS}
     ${BENCHMARK_UNPARSED_ARGUMENTS}
   )
   get_fq_target_name(${benchmark_name} fq_target_name)
@@ -58,7 +55,6 @@ add_unittest_framework_library(
     libc.src.__support.fixedvector
     libc.src.time.clock
     libc.benchmarks.gpu.timing.timing
-    libc.src.stdio.printf
 )
 
 add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 031ad163c20da..c926d8efd7db2 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -7,7 +7,6 @@
 #include "src/__support/GPU/utils.h"
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/config.h"
-#include "src/stdio/printf.h"
 #include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -74,10 +73,11 @@ struct AtomicBenchmarkSums {
 };
 
 AtomicBenchmarkSums all_results;
-constexpr auto GREEN = "\033[32m";
-constexpr auto RESET = "\033[0m";
 
 void print_results(Benchmark *b) {
+  constexpr auto GREEN = "\033[32m";
+  constexpr auto RESET = "\033[0m";
+
   BenchmarkResult result;
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
   int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
@@ -92,53 +92,21 @@ void print_results(Benchmark *b) {
       all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
   result.total_iterations =
       all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  const uint64_t duration_ns =
+  result.total_time =
       all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
-  const uint64_t duration_us = duration_ns / 1000;
-  const uint64_t duration_ms = duration_ns / (1000 * 1000);
-  uint64_t converted_duration = duration_ns;
-  const char *time_unit;
-  if (duration_ms != 0) {
-    converted_duration = duration_ms;
-    time_unit = "ms";
-  } else if (duration_us != 0) {
-    converted_duration = duration_us;
-    time_unit = "us";
-  } else {
-    converted_duration = duration_ns;
-    time_unit = "ns";
-  }
-  result.total_time = converted_duration;
-  // result.total_time =
-  //     all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
 
-  LIBC_NAMESPACE::printf(
-      "%-20s |%8ld |%8ld |%8ld |%11d |%9ld %2s |%9ld |%9d |\n",
-      b->get_test_name().data(), result.cycles, result.min, result.max,
-      result.total_iterations, result.total_time, time_unit,
-      static_cast<uint64_t>(result.standard_deviation), num_threads);
-}
-
-void print_header() {
-  LIBC_NAMESPACE::printf("%s", GREEN);
-  LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
-                         benchmarks[0]->get_suite_name().data());
-  LIBC_NAMESPACE::printf("%s", RESET);
-  LIBC_NAMESPACE::printf("Benchmark            |  Cycles |     Min |     Max | "
-                         "Iterations |        "
-                         "Time |   Stddev |  Threads |\n");
-  LIBC_NAMESPACE::printf(
-      "---------------------------------------------------------------------"
-      "--------------------------------\n");
+  log << GREEN << "[ RUN      ] " << RESET << b->get_name() << '\n';
+  log << GREEN << "[       OK ] " << RESET << b->get_name() << ": "
+      << result.cycles << " cycles, " << result.min << " min, " << result.max
+      << " max, " << result.total_iterations << " iterations, "
+      << result.total_time << " ns, "
+      << static_cast<uint64_t>(result.standard_deviation)
+      << " stddev (num threads: " << num_threads << ")\n";
 }
 
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
-
-  if (id == 0)
-    print_header();
-
   gpu::sync_threads();
 
   for (Benchmark *b : benchmarks) {
@@ -146,7 +114,10 @@ void Benchmark::run_benchmarks() {
       all_results.reset();
 
     gpu::sync_threads();
-    if (b->num_threads == static_cast<uint32_t>(-1) || id < b->num_threads) {
+    if (!b->flags ||
+        ((b->flags & BenchmarkFlags::SINGLE_THREADED) && id == 0) ||
+        ((b->flags & BenchmarkFlags::SINGLE_WAVE) &&
+         id < gpu::get_lane_size())) {
       auto current_result = b->run();
       all_results.update(current_result);
     }
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index f5cf4822f6fd3..29d7ba8b0a132 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -74,26 +74,24 @@ struct BenchmarkResult {
   clock_t total_time = 0;
 };
 
+enum BenchmarkFlags { SINGLE_THREADED = 0x1, SINGLE_WAVE = 0x2 };
+
 BenchmarkResult benchmark(const BenchmarkOptions &options,
                           cpp::function<uint64_t(void)> wrapper_func);
 
 class Benchmark {
   const cpp::function<uint64_t(void)> func;
-  const cpp::string_view suite_name;
-  const cpp::string_view test_name;
-  const uint32_t num_threads;
+  const cpp::string_view name;
+  const uint8_t flags;
 
 public:
-  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
-            char const *test_name, uint32_t num_threads)
-      : func(func), suite_name(suite_name), test_name(test_name),
-        num_threads(num_threads) {
+  Benchmark(cpp::function<uint64_t(void)> func, char const *name, uint8_t flags)
+      : func(func), name(name), flags(flags) {
     add_benchmark(this);
   }
 
   static void run_benchmarks();
-  const cpp::string_view get_suite_name() const { return suite_name; }
-  const cpp::string_view get_test_name() const { return test_name; }
+  const cpp::string_view get_name() const { return name; }
 
 protected:
   static void add_benchmark(Benchmark *benchmark);
@@ -107,21 +105,18 @@ class Benchmark {
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE_DECL
 
-// Passing -1 indicates the benchmark should be run with as many threads as
-// allocated by the user in the benchmark's CMake.
 #define BENCHMARK(SuiteName, TestName, Func)                                   \
   LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
-      Func, #SuiteName, #TestName, -1)
-
-#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
-  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
-      Func, #SuiteName, #TestName, NumThreads)
+      Func, #SuiteName "." #TestName, 0)
 
 #define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
-  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)
+  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
+      Func, #SuiteName "." #TestName,                                          \
+      LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_THREADED)
 
 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
-  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
-                      LIBC_NAMESPACE::gpu::get_lane_size())
+  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
+      Func, #SuiteName "." #TestName,                                          \
+      LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_WAVE)
 
 #endif
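
For context, a sketch of how benchmarks register under the reworked API above.
The workload `run_once` is hypothetical; only the macro names and flag
semantics come from the header:

```
#include "LibcGpuBenchmark.h" // in-tree include path may differ
#include <stdint.h>

// Illustrative workload; a real benchmark would time an operation and
// return the elapsed cycles.
static uint64_t run_once() { return 0; }

// No flag: runs on every active thread.
BENCHMARK(LlvmLibcExample, AllThreads, run_once);
// SINGLE_THREADED: only thread id 0 runs the workload.
SINGLE_THREADED_BENCHMARK(LlvmLibcExample, OneThread, run_once);
// SINGLE_WAVE: only ids below gpu::get_lane_size() run it.
SINGLE_WAVE_BENCHMARK(LlvmLibcExample, OneWave, run_once);
```
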
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 97d1c7262d24d..a722fe709af5f 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -6,7 +6,6 @@ function(_get_compile_options_from_flags output_var)
   endif()
   check_flag(ADD_ROUND_OPT_FLAG ${ROUND_OPT_FLAG} ${ARGN})
   check_flag(ADD_EXPLICIT_SIMD_OPT_FLAG ${EXPLICIT_SIMD_OPT_FLAG} ${ARGN})
-  check_flag(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG ${MISC_MATH_BASIC_OPS_OPT_FLAG} ${ARGN})
 
   if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
     if(ADD_FMA_FLAG)
@@ -38,9 +37,6 @@ function(_get_compile_options_from_flags output_var)
     if(ADD_EXPLICIT_SIMD_OPT_FLAG)
       list(APPEND compile_options "-D__LIBC_EXPLICIT_SIMD_OPT")
     endif()
-    if(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG)
-      list(APPEND compile_options "-D__LIBC_MISC_MATH_BASIC_OPS_OPT")
-    endif()
   elseif(MSVC)
     if(ADD_FMA_FLAG)
       list(APPEND compile_options "/arch:AVX2")
diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
index 4398fe55db5aa..eca7ba8d183e6 100644
--- a/libc/cmake/modules/LLVMLibCFlagRules.cmake
+++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -263,9 +263,6 @@ set(FMA_OPT_FLAG "FMA_OPT")
 set(ROUND_OPT_FLAG "ROUND_OPT")
 # This flag controls whether we use explicit SIMD instructions or not.
 set(EXPLICIT_SIMD_OPT_FLAG "EXPLICIT_SIMD_OPT")
-# This flag controls whether we use compiler builtin functions to implement
-# various basic math operations or not.
-set(MISC_MATH_BASIC_OPS_OPT_FLAG "MISC_MATH_BASIC_OPS_OPT")
 
 # Skip FMA_OPT flag for targets that don't support fma.
 if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "FMA")) OR
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 2d3db38ecd8a3..0b092e94ca8a1 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -65,6 +65,25 @@ function(create_object_library fq_target_name)
   target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
   target_compile_options(${fq_target_name} PRIVATE ${compile_options})
 
+  # The NVPTX target is installed as LLVM-IR but the internal testing toolchain
+  # cannot handle it natively. Make a separate internal target for testing.
+  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX AND NOT LIBC_GPU_TESTS_DISABLED)
+    add_library(
+      ${internal_target_name}
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_OBJECT_SRCS}
+      ${ADD_OBJECT_HDRS}
+    )
+    target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+    target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+    target_compile_options(${internal_target_name} PRIVATE ${compile_options}
+                           -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
+    set_target_properties(${internal_target_name}
+                          PROPERTIES
+                          CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD})
+  endif()
+
   if(SHOW_INTERMEDIATE_OBJECTS)
     message(STATUS "Adding object library ${fq_target_name}")
     if(${SHOW_INTERMEDIATE_OBJECTS} STREQUAL "DEPS")
@@ -264,6 +283,12 @@ function(create_entrypoint_object fq_target_name)
   add_dependencies(${internal_target_name} ${full_deps_list})
   target_link_libraries(${internal_target_name} ${full_deps_list})
 
+  # The NVPTX target cannot use LTO for the internal targets used for testing.
+  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    target_compile_options(${internal_target_name} PRIVATE
+                           -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
+  endif()
+
   add_library(
     ${fq_target_name}
     # We want an object library as the objects will eventually get packaged into
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 18adbee0bc7ad..4d349cb1799da 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -453,6 +453,8 @@ function(add_integration_test test_name)
   add_executable(
     ${fq_build_target_name}
     EXCLUDE_FROM_ALL
+    # The NVIDIA 'nvlink' linker does not currently support static libraries.
+    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
     ${INTEGRATION_TEST_SRCS}
     ${INTEGRATION_TEST_HDRS}
   )
@@ -471,6 +473,8 @@ function(add_integration_test test_name)
       "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
       "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
   elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    # We need to use the internal object versions for NVPTX.
+    set(internal_suffix ".__internal__")
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
       "-Wl,--suppress-stack-size-warning"
@@ -486,9 +490,10 @@ function(add_integration_test test_name)
   endif()
   target_link_libraries(
     ${fq_build_target_name}
-    ${fq_target_name}.__libc__
-    libc.startup.${LIBC_TARGET_OS}.crt1
-    libc.test.IntegrationTest.test
+    # The NVIDIA 'nvlink' linker does not currently support static libraries.
+    $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>
+    libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
+    libc.test.IntegrationTest.test${internal_suffix}
   )
   add_dependencies(${fq_build_target_name}
                    libc.test.IntegrationTest.test
@@ -623,6 +628,8 @@ function(add_libc_hermetic test_name)
   add_executable(
     ${fq_build_target_name}
     EXCLUDE_FROM_ALL
+    # The NVIDIA 'nvlink' linker does not currently support static libraries.
+    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
     ${HERMETIC_TEST_SRCS}
     ${HERMETIC_TEST_HDRS}
   )
@@ -649,12 +656,13 @@ function(add_libc_hermetic test_name)
 
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
-      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
-      "-Wl,-asdfasdfasdf"
+      ${LIBC_COMPILE_OPTIONS_DEFAULT}
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
       "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
       "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
   elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    # We need to use the internal object versions for NVPTX.
+    set(internal_suffix ".__internal__")
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
       "-Wl,--suppress-stack-size-warning"
@@ -671,10 +679,11 @@ function(add_libc_hermetic test_name)
   target_link_libraries(
     ${fq_build_target_name}
     PRIVATE
-      libc.startup.${LIBC_TARGET_OS}.crt1
+      libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
       ${link_libraries}
       LibcHermeticTestSupport.hermetic
-      ${fq_target_name}.__libc__)
+      # The NVIDIA 'nvlink' linker does not currently support static libraries.
+      $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
   add_dependencies(${fq_build_target_name}
                    LibcTest.hermetic
                    libc.test.UnitTest.ErrnoSetterMatcher
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index 8025ac09b9f82..4f2462abc7452 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -236,7 +236,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.canonicalize
     libc.src.math.canonicalizef
     libc.src.math.canonicalizel
-    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index fb0308c953746..c2ab9d7f19921 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -232,7 +232,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.canonicalize
     libc.src.math.canonicalizef
     libc.src.math.canonicalizel
-    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf
diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index ea5c7b537bbec..32a08f20b328f 100644
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -135,7 +135,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cos
     libc.src.math.cosf
     libc.src.math.cospif
-    libc.src.math.dsqrtl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.expf
diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt
index 1a7353172d464..02912decadcf7 100644
--- a/libc/config/darwin/x86_64/entrypoints.txt
+++ b/libc/config/darwin/x86_64/entrypoints.txt
@@ -119,7 +119,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     #libc.src.math.ceill
     #libc.src.math.coshf
     #libc.src.math.cosf
-    #libc.src.math.dsqrtl
     #libc.src.math.expf
     #libc.src.math.exp2f
     #libc.src.math.expm1f
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 42909cec55890..506c7d6d7b314 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -1,13 +1,3 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-  set(extra_entrypoints
-      # stdio.h entrypoints
-      libc.src.stdio.snprintf
-      libc.src.stdio.sprintf
-      libc.src.stdio.vsnprintf
-      libc.src.stdio.vsprintf
-  )
-endif()
-
 set(TARGET_LIBC_ENTRYPOINTS
     # assert.h entrypoints
     libc.src.assert.__assert_fail
@@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.errno.errno
 
     # stdio.h entrypoints
-    ${extra_entrypoints}
     libc.src.stdio.clearerr
     libc.src.stdio.fclose
     libc.src.stdio.printf
     libc.src.stdio.vprintf
     libc.src.stdio.fprintf
     libc.src.stdio.vfprintf
+    libc.src.stdio.sprintf
+    libc.src.stdio.snprintf
+    libc.src.stdio.vsprintf
+    libc.src.stdio.vsnprintf
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.fflush
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 0be6f884f0368..8afd3fb67197e 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -359,7 +359,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.coshf
     libc.src.math.cospif
     libc.src.math.dmull
-    libc.src.math.dsqrtl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.exp10
@@ -599,7 +598,6 @@ if(LIBC_TYPES_HAS_FLOAT128)
     # math.h C23 _Float128 entrypoints
     libc.src.math.ceilf128
     libc.src.math.copysignf128
-    libc.src.math.dsqrtf128
     libc.src.math.fabsf128
     libc.src.math.fdimf128
     libc.src.math.floorf128
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 55f118395c22e..0d09e4c22953c 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -227,7 +227,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cos
     libc.src.math.cosf
     libc.src.math.coshf
-    libc.src.math.dsqrtl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.exp10
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 6ab90771802fd..54a382eccb546 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -228,9 +228,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.epoll.epoll_ctl
     libc.src.sys.epoll.epoll_pwait
     libc.src.sys.epoll.epoll_wait
-    # TODO: Need to check if pwait2 is available before providing.
-    # https://github.com/llvm/llvm-project/issues/80060
-    # libc.src.sys.epoll.epoll_pwait2
+    libc.src.sys.epoll.epoll_pwait2
 
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
@@ -268,10 +266,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.stat.mkdirat
     libc.src.sys.stat.stat
 
-    # sys/statvfs.h
-    libc.src.sys.statvfs.fstatvfs
-    libc.src.sys.statvfs.statvfs
-
     # sys/utsname.h entrypoints
     libc.src.sys.utsname.uname
 
@@ -383,8 +377,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.cosf
     libc.src.math.coshf
     libc.src.math.cospif
-    libc.src.math.dmull
-    libc.src.math.dsqrtl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.exp10
@@ -441,7 +433,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmodf
     libc.src.math.fmodl
     libc.src.math.fmul
-    libc.src.math.fmull
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -451,8 +442,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fromfpx
     libc.src.math.fromfpxf
     libc.src.math.fromfpxl
-    libc.src.math.fsqrt
-    libc.src.math.fsqrtl
     libc.src.math.hypot
     libc.src.math.hypotf
     libc.src.math.ilogb
@@ -560,7 +549,6 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.ceilf128
     libc.src.math.copysignf128
     libc.src.math.dmulf128
-    libc.src.math.dsqrtf128
     libc.src.math.fabsf128
     libc.src.math.fdimf128
     libc.src.math.floorf128
@@ -579,7 +567,6 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.frexpf128
     libc.src.math.fromfpf128
     libc.src.math.fromfpxf128
-    libc.src.math.fsqrtf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
     libc.src.math.llogbf128
diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt
index 0294f62bc2f7a..da203e9850603 100644
--- a/libc/config/linux/riscv/headers.txt
+++ b/libc/config/linux/riscv/headers.txt
@@ -2,7 +2,6 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.assert
     libc.include.ctype
     libc.include.dirent
-    libc.include.dlfcn
     libc.include.errno
     libc.include.fcntl
     libc.include.features
@@ -19,7 +18,6 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.setjmp
     libc.include.stdckdint
     libc.include.stdbit
-    libc.include.stdfix
     libc.include.stdio
     libc.include.stdlib
     libc.include.string
@@ -35,17 +33,17 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.arpa_inet
 
     libc.include.sys_auxv
-    libc.include.sys_epoll
+    # Disabled due to epoll_wait syscalls not being available on this platform.
+    # libc.include.sys_epoll
     libc.include.sys_ioctl
     libc.include.sys_mman
     libc.include.sys_prctl
-    libc.include.sys_queue
     libc.include.sys_random
+    libc.include.sys_queue
     libc.include.sys_resource
     libc.include.sys_select
     libc.include.sys_socket
     libc.include.sys_stat
-    libc.include.sys_statvfs
     libc.include.sys_syscall
     libc.include.sys_time
     libc.include.sys_types
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index f7813fc16ff7c..736908809d96c 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -228,9 +228,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.epoll.epoll_ctl
     libc.src.sys.epoll.epoll_pwait
     libc.src.sys.epoll.epoll_wait
-    # TODO: Need to check if pwait2 is available before providing.
-    # https://github.com/llvm/llvm-project/issues/80060
-    # libc.src.sys.epoll.epoll_pwait2
+    libc.src.sys.epoll.epoll_pwait2
 
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
@@ -384,7 +382,6 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.coshf
     libc.src.math.cospif
     libc.src.math.dmull
-    libc.src.math.dsqrtl
     libc.src.math.erff
     libc.src.math.exp
     libc.src.math.exp10
@@ -646,7 +643,6 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.ceilf128
     libc.src.math.copysignf128
     libc.src.math.dmulf128
-    libc.src.math.dsqrtf128
     libc.src.math.fabsf128
     libc.src.math.fdimf128
     libc.src.math.floorf128
diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst
index 89036f1277b96..ed6127e403ec7 100644
--- a/libc/docs/dev/header_generation.rst
+++ b/libc/docs/dev/header_generation.rst
@@ -1,199 +1,120 @@
 Generating Public and Internal headers
 ======================================
 
-This is a new implementation of the previous libc header generator. The old
-header generator (libc-hdrgen aka "Headergen") was based on TableGen, which
-created an awkward dependency on the rest of LLVM for our build system. By
-creating a new standalone Headergen we can eliminate these dependencies for
-easier cross compatibility.
+.. warning::
+  This page is severely out of date. Much of the information it contains may be
+  incorrect. Please only remove this warning once the page has been updated.
 
-There are 3 main components of the new Headergen. The first component are the
-YAML files that contain all the function header information and are separated by
-header specification and standard. The second component are the classes that are
-created for each component of the function header: macros, enumerations, types,
-function, arguments, and objects. The third component is the Python script that
-uses the class representation to deserialize YAML files into its specific
-components and then reserializes the components into the function header. The
-Python script also combines the generated header content with header definitions
-and extra macro and type inclusions from the .h.def file.
+Other libc implementations make use of preprocessor macro tricks to make header
+files platform agnostic. When macros aren't suitable, they rely on build
+system tricks to pick the right set of files to compile and export. While these
+approaches have served them well, parts of their systems have become extremely
+complicated, making it hard to modify, extend, or maintain. To avoid these
+problems in llvm-libc, we use a header generation mechanism. The mechanism is
+driven by a *header configuration language*.
 
+Header Configuration Language
+-----------------------------
 
-Instructions
-------------
+The header configuration language consists of a few special *commands*. The header
+generation mechanism takes an input file, which has an extension of
+``.h.def``, and produces a header file with ``.h`` extension. The header
+configuration language commands are listed in the input ``.h.def`` file. While
+reading a ``.h.def`` file, the header generation tool does two things:
 
-Required Versions:
-  - Python Version: 3.11.8
+1. Copy the lines not containing commands as is into the output ``.h`` file.
+2. Replace the line on which a command occurs with some other text as directed
+   by the command. The replacement text can span multiple lines.
 
-1. Keep full-build mode on when building, otherwise headers will not be
-   generated.
-2. Once the build is complete, enter in the command line within the build
-   directory ``ninja check-newhdrgen`` to ensure that the integration tests are
-   passing.
-3. Then enter in the command line ``ninja libc`` to generate headers. Headers
-   will be in ``build/projects/libc/include`` or ``build/libc/include`` in a
-   runtime build. Sys spec headers will be located in
-   ``build/projects/libc/include/sys``.
+Command syntax
+~~~~~~~~~~~~~~
 
+A command should be listed on a line by itself, and should not span more than
+one line. The first token to appear on the line is the command name prefixed
+with ``%%``. For example, a line with the ``include_file`` command should start
+with ``%%include_file``. There can be indentation spaces before the ``%%``
+prefix.
 
-New Headergen is turned on by default, but if you want to use old Headergen,
-you can include this statement when building: ``-DLIBC_USE_NEW_HEADER_GEN=OFF``
+Most commands typically take arguments. They are listed as a comma-separated
+list of named identifiers within parentheses, similar to the C function call
+syntax. Before performing the action corresponding to the command, the header
+generator replaces the arguments with concrete values.
 
-To add a function to the YAML files, you can either manually enter it in the
-YAML file corresponding to the header it belongs to or add it through the
-command line.
+Argument Syntax
+~~~~~~~~~~~~~~~
 
-To add through the command line:
+Arguments are named identifiers, prefixed with ``$`` and enclosed in ``{``
+and ``}``. For example, ``${path_to_constants}``.
 
-1. Make sure you are in the llvm-project directory.
+Comments
+~~~~~~~~
 
-2. Enter in the command line:
+There can be cases wherein one wants to add comments in the .h.def file but
+does not want them to be copied into the generated header file. Such comments
+can be added by beginning the comment lines with the ``<!>`` prefix. Currently,
+comments have to be on lines of their own. That is, they cannot be suffixes like
+this:
 
-   .. code-block:: none
+```
+%%include_file(a/b/c) <!> Path to c in b of a.  !!! WRONG SYNTAX
+```
 
-     python3 libc/newhdrgen/yaml_to_classes.py
-     libc/newhdrgen/yaml/[yaml_file.yaml] --add_function "<return_type>" <function_name> "<function_arg1, function_arg2>" <standard> <guard> <attribute>
+Available Commands
+------------------
 
-   Example:
+Sub-sections below describe the commands currently available. Under each command
+is the description of the arguments to the command, and the action taken by the
+header generation tool when processing a command.
 
-   .. code-block:: none
+``include_file``
+~~~~~~~~~~~~~~~~
 
-      python3 libc/newhdrgen/yaml_to_classes.py
-      libc/newhdrgen/yaml/ctype.yaml --add_function "char" example_function
-      "int, void, const void" stdc example_float example_attribute
+This is a replacement command which should be listed in an input ``.h.def``
+file.
 
-   Keep in mind only the return_type and arguments have quotes around them. If
-   you do not have any guards or attributes you may enter "null" for both.
+Arguments
 
-3. Check the YAML file that the added function is present. You will also get a
-   generated header file with the new addition in the newhdrgen directory to
-   examine.
+  * **path argument** - An argument representing a path to a file. The file
+    should have an extension of ``.h.inc``.
 
+Action
 
-Testing
--------
+  This command instructs that the line on which the command appears should be
+  replaced by the contents of the file whose path is passed as argument to the
+  command.
 
-New Headergen has an integration test that you may run once you have configured
-your CMake within the build directory. In the command line, enter the following:
-``ninja check-newhdrgen``. The integration test is one test that ensures the
-process of YAML to classes to generate headers works properly. If there are any
-new additions on formatting headers, make sure the test is updated with the
-specific addition.
+``begin``
+~~~~~~~~~
 
-Integration Test can be found in: ``libc/newhdrgen/tests/test_integration.py``
+This is not a replacement command. It is an error to list it in the input
+``.h.def`` file. It is normally listed in the files included by the
+``include_file`` command (the ``.h.inc`` files). A common use of this command is
+to mark the beginning of what is to be included. This prevents copying items like
+license headers into the generated header file.
 
-File to modify if adding something to formatting:
-``libc/newhdrgen/tests/expected_output/test_header.h``
+Arguments
 
+  None.
 
-Common Errors
--------------
-1. Missing function specific component
+Action
 
-   Example:
+  The header generator will only include content starting from the line after the
+  line on which this command is listed.
 
-   .. code-block:: none
+``public_api``
+~~~~~~~~~~~~~~
 
-      "/llvm-project/libc/newhdrgen/yaml_to_classes.py", line 67, in yaml_to_classes function_data["return_type"]
+This is a replacement command which should be listed in an input ``.h.def``
+file. The header file generator will replace this command with the public API of
+the target platform. See the build system document for more information on the
+relevant build rules. Also, see "Mechanics of public_api" to learn the mechanics
+of how the header generator replaces this command with the public API.
 
-   If you receive this error or any error pertaining to
-   ``function_data[function_specific_component]`` while building the headers
-   that means the function specific component is missing within the YAML files.
-   Through the call stack, you will be able to find the header file which has
-   the issue. Ensure there is no missing function specific component for that
-   YAML header file.
+Arguments
 
-2. CMake Error: require argument to be specified
+  None.
 
-   Example:
+Action
 
-   .. code-block:: none
-
-     CMake Error at:
-     /llvm-project/libc/cmake/modules/LLVMLibCHeaderRules.cmake:86 (message):
-     'add_gen_hdr2' rule requires GEN_HDR to be specified.
-     Call Stack (most recent call first):
-     /llvm-project/libc/include/CMakeLists.txt:22 (add_gen_header2)
-     /llvm-project/libc/include/CMakeLists.txt:62 (add_header_macro)
-
-   If you receive this error, there is a missing YAML file, h_def file, or
-   header name within the ``libc/include/CMakeLists.txt``. The last line in the
-   error call stack will point to the header where there is a specific component
-   missing. Ensure the correct style and required files are present:
-
-   | ``[header_name]``
-   | ``[../libc/newhdrgen/yaml/[yaml_file.yaml]``
-   | ``[header_name.h.def]``
-   | ``[header_name.h]``
-   | ``DEPENDS``
-   |   ``{Necessary Depend Files}``
-
-3. Command line: expected arguments
-
-   Example:
-
-   .. code-block:: none
-
-     usage: yaml_to_classes.py [-h] [--output_dir OUTPUT_DIR] [--h_def_file H_DEF_FILE]
-     [--add_function RETURN_TYPE NAME ARGUMENTS STANDARDS GUARD ATTRIBUTES]
-     [--e ENTRY_POINTS] [--export-decls]
-     yaml_file
-     yaml_to_classes.py:
-     error: argument --add_function: expected 6 arguments
-
-   In the process of adding a function, you may run into an issue where the
-   command line is requiring more arguments than what you currently have. Ensure
-   that all components of the new function are filled. Even if you do not have a
-   guard or attribute, make sure to put null in those two areas.
-
-4. Object has no attribute
-
-   Example:
-
-   .. code-block:: none
-
-     File "/llvm-project/libc/newhdrgen/header.py", line 60, in __str__ for
-     function in self.functions: AttributeError: 'HeaderFile' object has no
-     attribute 'functions'
-
-   When running ``ninja libc`` in the build directory to generate headers you
-   may receive the error above. Essentially this means that in
-   ``libc/newhdrgen/header.py`` there is a missing attribute named functions.
-   Make sure all function components are defined within this file and there are
-   no missing functions to add these components.
-
-5. Unknown type name
-
-   Example:
-
-   .. code-block:: none
-
-     /llvm-project/build/projects/libc/include/sched.h:20:25: error: unknown type
-     name 'size_t'; did you mean 'time_t'?
-     20 | int_sched_getcpucount(size_t, const cpu_set_t*) __NOEXCEPT
-      |           ^
-     /llvm-project/build/projects/libc/include/llvm-libc-types/time_t.h:15:24:
-     note: 'time_t' declared here
-     15 | typedef __INT64_TYPE__ time_t;
-     |                    ^
-
-   During the header generation process errors like the one above may occur
-   because there are missing types for a specific header file. Check the YAML
-   file corresponding to the header file and make sure all the necessary types
-   that are being used are input into the types as well. Delete the specific
-   header file from the build folder and re-run ``ninja libc`` to ensure the
-   types are being recognized.
-
-6. Test Integration Errors
-
-   Sometimes the integration test will fail but that
-   still means the process is working unless the comparison between the output
-   and expected_output is not showing. If that is the case make sure in
-   ``libc/newhdrgen/tests/test_integration.py`` there are no missing arguments
-   that run through the script.
-
-   If the integration tests are failing due to mismatching of lines or small
-   errors in spacing that is nothing to worry about. If this is happening while
-   you are making a new change to the formatting of the headers, then
-   ensure the expected output file
-   ``libc/newhdrgen/tests/expected_output/test_header.h`` has the changes you
-   are applying.
+  The header generator will replace this command with the public API to be exposed
+  from the generated header file.
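
To make the restored description concrete, a hypothetical input sketch in the
``.h.def`` notation defined above (the file and path names are made up):

```
<!> example.h.def -- these comment lines are not copied into the output.
#ifndef LLVM_LIBC_EXAMPLE_H
#define LLVM_LIBC_EXAMPLE_H

<!> Replaced by the contents of the named .h.inc file.
%%include_file(${path_to_example_constants})

<!> Replaced by the target platform's public API.
%%public_api()

#endif // LLVM_LIBC_EXAMPLE_H
```
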
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index f287c16fd01e2..5cf9d8a42929e 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -278,7 +278,7 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | cospi     | |check|          |                 |                        |                      |                        | 7.12.4.12              | F.10.1.12                  |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| dsqrt     | N/A              | N/A             |   |check|              | N/A                  |       |check|\*        | 7.12.14.6              | F.10.11                    |
+| dsqrt     | N/A              | N/A             |                        | N/A                  |                        | 7.12.14.6              | F.10.11                    |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | erf       | |check|          |                 |                        |                      |                        | 7.12.8.1               | F.10.5.1                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/include/llvm-libc-types/fsblkcnt_t.h b/libc/include/llvm-libc-types/fsblkcnt_t.h
index 8c7d3307278cd..88a53d38cb1b3 100644
--- a/libc/include/llvm-libc-types/fsblkcnt_t.h
+++ b/libc/include/llvm-libc-types/fsblkcnt_t.h
@@ -9,6 +9,6 @@
 #ifndef LLVM_LIBC_TYPES_FSBLKCNT_T_H
 #define LLVM_LIBC_TYPES_FSBLKCNT_T_H
 
-typedef __UINT64_TYPE__ fsblkcnt_t;
+typedef __SIZE_TYPE__ fsblkcnt_t;
 
 #endif // LLVM_LIBC_TYPES_FSBLKCNT_T_H
diff --git a/libc/include/llvm-libc-types/fsfilcnt_t.h b/libc/include/llvm-libc-types/fsfilcnt_t.h
index 12697830e92eb..c5693591907ab 100644
--- a/libc/include/llvm-libc-types/fsfilcnt_t.h
+++ b/libc/include/llvm-libc-types/fsfilcnt_t.h
@@ -9,6 +9,6 @@
 #ifndef LLVM_LIBC_TYPES_FSFILCNT_T_H
 #define LLVM_LIBC_TYPES_FSFILCNT_T_H
 
-typedef __UINT64_TYPE__ fsfilcnt_t;
+typedef __SIZE_TYPE__ fsfilcnt_t;
 
 #endif // LLVM_LIBC_TYPES_FSFILCNT_T_H
diff --git a/libc/newhdrgen/yaml_to_classes.py b/libc/newhdrgen/yaml_to_classes.py
index d3bdfde2e2a22..78588055cc960 100644
--- a/libc/newhdrgen/yaml_to_classes.py
+++ b/libc/newhdrgen/yaml_to_classes.py
@@ -118,7 +118,7 @@ def load_yaml_file(yaml_file, header_class, entry_points):
         HeaderFile: An instance of HeaderFile populated with the data.
     """
     with open(yaml_file, "r") as f:
-        yaml_data = yaml.load(f, Loader=yaml.FullLoader)
+        yaml_data = yaml.safe_load(f)
     return yaml_to_classes(yaml_data, header_class, entry_points)
 
 
@@ -173,7 +173,8 @@ def add_function_to_yaml(yaml_file, function_details):
     new_function = parse_function_details(function_details)
 
     with open(yaml_file, "r") as f:
-        yaml_data = yaml.load(f, Loader=yaml.FullLoader)
+        yaml_data = yaml.safe_load(f)
+
     if "functions" not in yaml_data:
         yaml_data["functions"] = []
 
diff --git a/libc/spec/llvm_libc_ext.td b/libc/spec/llvm_libc_ext.td
index f3a8862574ac5..6291a9ceec60d 100644
--- a/libc/spec/llvm_libc_ext.td
+++ b/libc/spec/llvm_libc_ext.td
@@ -57,8 +57,6 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> {
       [], // Types
       [], // Enumerations
       [
-          GuardedFunctionSpec<"dsqrtf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
-	  
           GuardedFunctionSpec<"f16add", RetValSpec<Float16Type>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"f16addf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"f16addl", RetValSpec<Float16Type>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 0aae65308d33a..437d6d04faf96 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -717,9 +717,6 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"canonicalizel", RetValSpec<IntType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
           GuardedFunctionSpec<"canonicalizef16", RetValSpec<IntType>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"canonicalizef128", RetValSpec<IntType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
-	  
-          FunctionSpec<"dsqrtl", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>]>,
-	
 
           GuardedFunctionSpec<"totalorderf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
 
diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h
index 3b7d7a5c249ae..a963a92bfb074 100644
--- a/libc/src/__support/FPUtil/BasicOperations.h
+++ b/libc/src/__support/FPUtil/BasicOperations.h
@@ -11,8 +11,8 @@
 
 #include "FEnvImpl.h"
 #include "FPBits.h"
-#include "dyadic_float.h"
 
+#include "FEnvImpl.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
@@ -269,21 +269,12 @@ totalordermag(T x, T y) {
 template <typename T>
 LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> getpayload(T x) {
   using FPBits = FPBits<T>;
-  using StorageType = typename FPBits::StorageType;
   FPBits x_bits(x);
 
   if (!x_bits.is_nan())
     return T(-1.0);
 
-  StorageType payload = x_bits.uintval() & (FPBits::FRACTION_MASK >> 1);
-
-  if constexpr (is_big_int_v<StorageType>) {
-    DyadicFloat<FPBits::STORAGE_LEN> payload_dfloat(Sign::POS, 0, payload);
-
-    return static_cast<T>(payload_dfloat);
-  } else {
-    return static_cast<T>(payload);
-  }
+  return T(x_bits.uintval() & (FPBits::FRACTION_MASK >> 1));
 }
 
 template <bool IsSignaling, typename T>
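
A standalone sketch of what the simplified `getpayload` above computes for
`double`, using plain integer operations rather than the libc internals:

```
// Mask off sign, exponent, and the quiet-NaN bit; the remaining fraction
// bits are the payload. Non-NaN inputs yield -1.0, matching the code above.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

static double getpayload_sketch(double x) {
  if (!std::isnan(x))
    return -1.0;
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  const uint64_t FRACTION_MASK = (uint64_t(1) << 52) - 1;
  return static_cast<double>(bits & (FRACTION_MASK >> 1));
}

int main() {
  // std::nan("42") encodes payload 42 on common implementations.
  std::printf("%g\n", getpayload_sketch(std::nan("42"))); // 42
  std::printf("%g\n", getpayload_sketch(1.0));            // -1
}
```
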
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index 8804f3a4d5e23..793d3a121c742 100644
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -75,6 +75,19 @@ add_header_library(
     libc.src.__support.common
 )
 
+add_header_library(
+  basic_operations
+  HDRS
+    BasicOperations.h
+  DEPENDS
+    .fp_bits
+    .fenv_impl
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.uint128
+    libc.src.__support.common
+    libc.src.__support.macros.optimization
+)
+
 add_header_library(
   division_and_remainder_operations
   HDRS
@@ -100,6 +113,21 @@ add_header_library(
 )
 
 
+add_header_library(
+  hypot
+  HDRS
+    Hypot.h
+  DEPENDS
+    .basic_operations
+    .fenv_impl
+    .fp_bits
+    .rounding_mode
+    libc.src.__support.common
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.uint128
+)
+
 add_header_library(
   sqrt
   HDRS
@@ -180,35 +208,6 @@ add_header_library(
     libc.src.__support.macros.optimization
 )
 
-add_header_library(
-  basic_operations
-  HDRS
-    BasicOperations.h
-  DEPENDS
-    .dyadic_float
-    .fp_bits
-    .fenv_impl
-    libc.src.__support.CPP.type_traits
-    libc.src.__support.uint128
-    libc.src.__support.common
-    libc.src.__support.macros.optimization
-)
-
-add_header_library(
-  hypot
-  HDRS
-    Hypot.h
-  DEPENDS
-    .basic_operations
-    .fenv_impl
-    .fp_bits
-    .rounding_mode
-    libc.src.__support.common
-    libc.src.__support.CPP.bit
-    libc.src.__support.CPP.type_traits
-    libc.src.__support.uint128
-)
-
 add_header_library(
   manipulation_functions
   HDRS
diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index e5683c8ff61ea..337301d86d919 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -129,27 +129,27 @@ fma(InType x, InType y, InType z) {
         raise_except_if_required(FE_INVALID);
 
       if (x_bits.is_quiet_nan()) {
-        InStorageType x_payload = x_bits.get_mantissa();
-        x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(x_bits.sign(),
-                                    static_cast<OutStorageType>(x_payload))
-            .get_val();
+        InStorageType x_payload = static_cast<InStorageType>(getpayload(x));
+        if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(x_bits.sign(),
+                                      static_cast<OutStorageType>(x_payload))
+              .get_val();
       }
 
       if (y_bits.is_quiet_nan()) {
-        InStorageType y_payload = y_bits.get_mantissa();
-        y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(y_bits.sign(),
-                                    static_cast<OutStorageType>(y_payload))
-            .get_val();
+        InStorageType y_payload = static_cast<InStorageType>(getpayload(y));
+        if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(y_bits.sign(),
+                                      static_cast<OutStorageType>(y_payload))
+              .get_val();
       }
 
       if (z_bits.is_quiet_nan()) {
-        InStorageType z_payload = z_bits.get_mantissa();
-        z_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(z_bits.sign(),
-                                    static_cast<OutStorageType>(z_payload))
-            .get_val();
+        InStorageType z_payload = static_cast<InStorageType>(getpayload(z));
+        if ((z_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(z_bits.sign(),
+                                      static_cast<OutStorageType>(z_payload))
+              .get_val();
       }
 
       return OutFPBits::quiet_nan().get_val();
@@ -163,27 +163,18 @@ fma(InType x, InType y, InType z) {
   int y_exp = 0;
   int z_exp = 0;
 
-  // Denormal scaling = 2^(fraction length).
-  constexpr InStorageType IMPLICIT_MASK =
-      InFPBits::SIG_MASK - InFPBits::FRACTION_MASK;
-
-  constexpr InType DENORMAL_SCALING =
-      InFPBits::create_value(
-          Sign::POS, InFPBits::FRACTION_LEN + InFPBits::EXP_BIAS, IMPLICIT_MASK)
-          .get_val();
-
   // Normalize denormal inputs.
   if (LIBC_UNLIKELY(InFPBits(x).is_subnormal())) {
     x_exp -= InFPBits::FRACTION_LEN;
-    x *= DENORMAL_SCALING;
+    x *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
   }
   if (LIBC_UNLIKELY(InFPBits(y).is_subnormal())) {
     y_exp -= InFPBits::FRACTION_LEN;
-    y *= DENORMAL_SCALING;
+    y *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
   }
   if (LIBC_UNLIKELY(InFPBits(z).is_subnormal())) {
     z_exp -= InFPBits::FRACTION_LEN;
-    z *= DENORMAL_SCALING;
+    z *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
   }
 
   x_bits = InFPBits(x);
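
The quiet-NaN hunks above now propagate a payload to the narrower output type
only when it fits in the output's payload field; a minimal sketch of that mask
test, with a made-up 10-bit fraction width standing in for OutFPBits:

```
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical output type with a 10-bit fraction; the payload field is
  // one bit narrower because the top fraction bit marks a quiet NaN.
  const uint64_t OUT_FRACTION_MASK = (uint64_t(1) << 10) - 1;
  uint64_t small = 0x1ff; // fits in the 9 payload bits
  uint64_t large = 0x400; // would need a tenth bit
  std::printf("%d\n", (small & ~(OUT_FRACTION_MASK >> 1)) == 0); // 1: kept
  std::printf("%d\n", (large & ~(OUT_FRACTION_MASK >> 1)) == 0); // 0: dropped
}
```
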
diff --git a/libc/src/__support/FPUtil/generic/add_sub.h b/libc/src/__support/FPUtil/generic/add_sub.h
index 850db3f83209e..ec20a8723b704 100644
--- a/libc/src/__support/FPUtil/generic/add_sub.h
+++ b/libc/src/__support/FPUtil/generic/add_sub.h
@@ -56,19 +56,19 @@ add_or_sub(InType x, InType y) {
         raise_except_if_required(FE_INVALID);
 
       if (x_bits.is_quiet_nan()) {
-        InStorageType x_payload = x_bits.get_mantissa();
-        x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(x_bits.sign(),
-                                    static_cast<OutStorageType>(x_payload))
-            .get_val();
+        InStorageType x_payload = static_cast<InStorageType>(getpayload(x));
+        if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(x_bits.sign(),
+                                      static_cast<OutStorageType>(x_payload))
+              .get_val();
       }
 
       if (y_bits.is_quiet_nan()) {
-        InStorageType y_payload = y_bits.get_mantissa();
-        y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(y_bits.sign(),
-                                    static_cast<OutStorageType>(y_payload))
-            .get_val();
+        InStorageType y_payload = static_cast<InStorageType>(getpayload(y));
+        if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(y_bits.sign(),
+                                      static_cast<OutStorageType>(y_payload))
+              .get_val();
       }
 
       return OutFPBits::quiet_nan().get_val();
@@ -174,12 +174,10 @@ add_or_sub(InType x, InType y) {
     else
       aligned_min_mant_sticky = true;
 
-    InStorageType min_mant_sticky(static_cast<int>(aligned_min_mant_sticky));
-
     if (is_effectively_add)
-      result_mant = max_mant + (aligned_min_mant | min_mant_sticky);
+      result_mant = max_mant + (aligned_min_mant | aligned_min_mant_sticky);
     else
-      result_mant = max_mant - (aligned_min_mant | min_mant_sticky);
+      result_mant = max_mant - (aligned_min_mant | aligned_min_mant_sticky);
   }
 
   int result_exp = max_bits.get_exponent() - RESULT_FRACTION_LEN;
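
The simplification above ORs the boolean sticky flag straight into the aligned
mantissa instead of first materializing it as a storage-type value. A tiny
sketch of why the low-bit OR preserves the rounding information:

```
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t mant = 0b100100; // 36
  int shift = 3;            // alignment discards the low 3 bits
  // Sticky: were any nonzero bits shifted out?
  bool sticky = (mant & ((uint64_t(1) << shift) - 1)) != 0;
  // ORing the flag into the low bit keeps the result "inexact" for rounding.
  uint64_t aligned = (mant >> shift) | sticky;
  std::printf("%llu\n", (unsigned long long)aligned); // 5 rather than 4
}
```
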
diff --git a/libc/src/__support/FPUtil/generic/div.h b/libc/src/__support/FPUtil/generic/div.h
index dad1772fce750..27545786aea17 100644
--- a/libc/src/__support/FPUtil/generic/div.h
+++ b/libc/src/__support/FPUtil/generic/div.h
@@ -49,19 +49,19 @@ div(InType x, InType y) {
         raise_except_if_required(FE_INVALID);
 
       if (x_bits.is_quiet_nan()) {
-        InStorageType x_payload = x_bits.get_mantissa();
-        x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(x_bits.sign(),
-                                    static_cast<OutStorageType>(x_payload))
-            .get_val();
+        InStorageType x_payload = static_cast<InStorageType>(getpayload(x));
+        if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(x_bits.sign(),
+                                      static_cast<OutStorageType>(x_payload))
+              .get_val();
       }
 
       if (y_bits.is_quiet_nan()) {
-        InStorageType y_payload = y_bits.get_mantissa();
-        y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(y_bits.sign(),
-                                    static_cast<OutStorageType>(y_payload))
-            .get_val();
+        InStorageType y_payload = static_cast<InStorageType>(getpayload(y));
+        if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(y_bits.sign(),
+                                      static_cast<OutStorageType>(y_payload))
+              .get_val();
       }
 
       return OutFPBits::quiet_nan().get_val();
diff --git a/libc/src/__support/FPUtil/generic/mul.h b/libc/src/__support/FPUtil/generic/mul.h
index 20d9a77792762..02fc69c6cb1ba 100644
--- a/libc/src/__support/FPUtil/generic/mul.h
+++ b/libc/src/__support/FPUtil/generic/mul.h
@@ -50,19 +50,19 @@ mul(InType x, InType y) {
         raise_except_if_required(FE_INVALID);
 
       if (x_bits.is_quiet_nan()) {
-        InStorageType x_payload = x_bits.get_mantissa();
-        x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(x_bits.sign(),
-                                    static_cast<OutStorageType>(x_payload))
-            .get_val();
+        InStorageType x_payload = static_cast<InStorageType>(getpayload(x));
+        if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(x_bits.sign(),
+                                      static_cast<OutStorageType>(x_payload))
+              .get_val();
       }
 
       if (y_bits.is_quiet_nan()) {
-        InStorageType y_payload = y_bits.get_mantissa();
-        y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN;
-        return OutFPBits::quiet_nan(y_bits.sign(),
-                                    static_cast<OutStorageType>(y_payload))
-            .get_val();
+        InStorageType y_payload = static_cast<InStorageType>(getpayload(y));
+        if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0)
+          return OutFPBits::quiet_nan(y_bits.sign(),
+                                      static_cast<OutStorageType>(y_payload))
+              .get_val();
       }
 
       return OutFPBits::quiet_nan().get_val();
diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index 4742b2a00220b..e11013121b04b 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -33,8 +33,7 @@ int fcntl(int fd, int cmd, void *arg) {
 #error "fcntl and fcntl64 syscalls not available."
 #endif
 
-  int new_cmd = cmd;
-  switch (new_cmd) {
+  switch (cmd) {
   case F_OFD_SETLKW: {
     struct flock *flk = reinterpret_cast<struct flock *>(arg);
     // convert the struct to a flock64
@@ -45,8 +44,7 @@ int fcntl(int fd, int cmd, void *arg) {
     flk64.l_len = flk->l_len;
     flk64.l_pid = flk->l_pid;
     // create a syscall
-    return LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, new_cmd,
-                                             &flk64);
+    return LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
   }
   case F_OFD_GETLK:
   case F_OFD_SETLK: {
@@ -59,8 +57,8 @@ int fcntl(int fd, int cmd, void *arg) {
     flk64.l_len = flk->l_len;
     flk64.l_pid = flk->l_pid;
     // create a syscall
-    int retVal = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd,
-                                                   new_cmd, &flk64);
+    int retVal =
+        LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
     // On failure, return
     if (retVal == -1)
       return -1;
@@ -88,31 +86,17 @@ int fcntl(int fd, int cmd, void *arg) {
     libc_errno = -ret;
     return -1;
   }
-#ifdef SYS_fcntl64
-  case F_GETLK: {
-    if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_GETLK64;
-    break;
-  }
-  case F_SETLK: {
-    if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_SETLK64;
-    break;
-  }
-  case F_SETLKW: {
-    if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_SETLKW64;
-    break;
-  }
-#endif
+  // The general case
+  default: {
+    int retVal = LIBC_NAMESPACE::syscall_impl<int>(
+        FCNTL_SYSCALL_ID, fd, cmd, reinterpret_cast<void *>(arg));
+    if (retVal >= 0) {
+      return retVal;
+    }
+    libc_errno = -retVal;
+    return -1;
   }
-  int retVal = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, new_cmd,
-                                                 reinterpret_cast<void *>(arg));
-  if (retVal >= 0) {
-    return retVal;
   }
-  libc_errno = -retVal;
-  return -1;
 }
 
 } // namespace internal
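
For reference, the field-by-field widening the OFD-lock cases above perform,
as a standalone sketch (the structs are abbreviated stand-ins limited to the
fields visible in the hunks, not the real kernel definitions):

```
// flock carries word-sized offsets; flock64 widens them so the same lock
// request can be issued through the fcntl64 syscall on 32-bit targets.
struct flock_sketch   { long      l_start, l_len; int l_pid; };
struct flock64_sketch { long long l_start, l_len; int l_pid; };

static flock64_sketch widen(const flock_sketch &flk) {
  flock64_sketch flk64{};
  flk64.l_start = flk.l_start;
  flk64.l_len = flk.l_len;
  flk64.l_pid = flk.l_pid;
  return flk64;
}
```
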
diff --git a/libc/src/__support/libc_assert.h b/libc/src/__support/libc_assert.h
index e21a58a0c8aad..e3235199780c2 100644
--- a/libc/src/__support/libc_assert.h
+++ b/libc/src/__support/libc_assert.h
@@ -10,7 +10,7 @@
 #define LLVM_LIBC_SRC___SUPPORT_LIBC_ASSERT_H
 
 #include "src/__support/macros/config.h"
-#if defined(LIBC_COPT_USE_C_ASSERT) || !defined(LIBC_FULL_BUILD)
+#ifdef LIBC_COPT_USE_C_ASSERT
 
 // The build is configured to just use the public <assert.h> API
 // for libc's internal assertions.
diff --git a/libc/src/__support/threads/linux/raw_mutex.h b/libc/src/__support/threads/linux/raw_mutex.h
index 47f0aa70f1c46..dbf8b53b42f7e 100644
--- a/libc/src/__support/threads/linux/raw_mutex.h
+++ b/libc/src/__support/threads/linux/raw_mutex.h
@@ -13,7 +13,6 @@
 #include "src/__support/libc_assert.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h"
 #include "src/__support/threads/linux/futex_utils.h"
 #include "src/__support/threads/linux/futex_word.h"
 #include "src/__support/threads/sleep.h"
diff --git a/libc/src/__support/threads/thread.h b/libc/src/__support/threads/thread.h
index b9ce3d77d3126..931745299ffac 100644
--- a/libc/src/__support/threads/thread.h
+++ b/libc/src/__support/threads/thread.h
@@ -22,7 +22,6 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
 
-// TODO: fix this unguarded linux dep
 #include <linux/param.h> // for exec_pagesize.
 
 #include <stddef.h> // For size_t
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 25aef3f72e3cd..82dd23186187b 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -89,9 +89,6 @@ add_math_entrypoint_object(cospif)
 add_math_entrypoint_object(dmull)
 add_math_entrypoint_object(dmulf128)
 
-add_math_entrypoint_object(dsqrtl)
-add_math_entrypoint_object(dsqrtf128)
-
 add_math_entrypoint_object(erf)
 add_math_entrypoint_object(erff)
 
diff --git a/libc/src/math/dsqrtf128.h b/libc/src/math/dsqrtf128.h
deleted file mode 100644
index 97103392e3fef..0000000000000
--- a/libc/src/math/dsqrtf128.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-- Implementation header for dsqrtf128 ---------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_MATH_DSQRTF128_H
-#define LLVM_LIBC_SRC_MATH_DSQRTF128_H
-
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/types.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-double dsqrtf128(float128 x);
-
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC_MATH_DSQRTF128_H
diff --git a/libc/src/math/dsqrtl.h b/libc/src/math/dsqrtl.h
deleted file mode 100644
index 7bf0255697da4..0000000000000
--- a/libc/src/math/dsqrtl.h
+++ /dev/null
@@ -1,20 +0,0 @@
-//===-- Implementation header for dsqrtl ------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_MATH_DSQRTL_H
-#define LLVM_LIBC_SRC_MATH_DSQRTL_H
-
-#include "src/__support/macros/config.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-double dsqrtl(long double x);
-
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC_MATH_DSQRTL_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 74360edff3f9a..7712a6d1ae527 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -129,31 +129,6 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.nearest_integer_operations
 )
 
-add_entrypoint_object(
-  dsqrtl
-  SRCS
-    dsqrtl.cpp
-  HDRS
-    ../dsqrtl.h
-  DEPENDS
-    libc.src.__support.FPUtil.generic.sqrt
-  COMPILE_OPTIONS
-    -O3
-)
-
-add_entrypoint_object(
-  dsqrtf128
-  SRCS
-    dsqrtf128.cpp
-  HDRS
-    ../dsqrtf128.h
-  DEPENDS
-    libc.src.__support.macros.properties.types
-    libc.src.__support.FPUtil.generic.sqrt
-  COMPILE_OPTIONS
-    -O3
-)
-
 add_header_library(
   range_reduction
   HDRS
@@ -424,9 +399,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O3
-  FLAGS
-    MISC_MATH_BASIC_OPS_OPT
+    -O2
 )
 
 add_entrypoint_object(
@@ -438,9 +411,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O3
-  FLAGS
-    MISC_MATH_BASIC_OPS_OPT
+    -O2
 )
 
 add_entrypoint_object(
@@ -452,7 +423,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
-    -O3
+    -O2
 )
 
 add_entrypoint_object(
@@ -464,12 +435,8 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.basic_operations
-    libc.src.__support.macros.properties.architectures
-    libc.src.__support.macros.properties.compiler
   COMPILE_OPTIONS
     -O3
-  FLAGS
-    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -1451,8 +1418,6 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
-  FLAGS
-    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -1465,8 +1430,6 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
-  FLAGS
-    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -1492,8 +1455,6 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
-  FLAGS
-    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
diff --git a/libc/src/math/generic/copysign.cpp b/libc/src/math/generic/copysign.cpp
index 186bb2c5983f4..149d725af08e2 100644
--- a/libc/src/math/generic/copysign.cpp
+++ b/libc/src/math/generic/copysign.cpp
@@ -14,11 +14,7 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(double, copysign, (double x, double y)) {
-#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
-  return __builtin_copysign(x, y);
-#else
   return fputil::copysign(x, y);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/copysignf.cpp b/libc/src/math/generic/copysignf.cpp
index c79e50b61ebda..17cd70d37c308 100644
--- a/libc/src/math/generic/copysignf.cpp
+++ b/libc/src/math/generic/copysignf.cpp
@@ -14,11 +14,7 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float, copysignf, (float x, float y)) {
-#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
-  return __builtin_copysignf(x, y);
-#else
   return fputil::copysign(x, y);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/copysignf16.cpp b/libc/src/math/generic/copysignf16.cpp
index 546622f049ebe..42695b3b4a6de 100644
--- a/libc/src/math/generic/copysignf16.cpp
+++ b/libc/src/math/generic/copysignf16.cpp
@@ -14,11 +14,7 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, copysignf16, (float16 x, float16 y)) {
-#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
-  return __builtin_copysignf16(x, y);
-#else
   return fputil::copysign(x, y);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dsqrtf128.cpp b/libc/src/math/generic/dsqrtf128.cpp
deleted file mode 100644
index ad8339309b0f3..0000000000000
--- a/libc/src/math/generic/dsqrtf128.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//===-- Implementation of dsqrtf128 function ------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/math/dsqrtf128.h"
-#include "src/__support/FPUtil/generic/sqrt.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-LLVM_LIBC_FUNCTION(double, dsqrtf128, (float128 x)) {
-  return fputil::sqrt<double>(x);
-}
-
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dsqrtl.cpp b/libc/src/math/generic/dsqrtl.cpp
deleted file mode 100644
index bf1dae9161460..0000000000000
--- a/libc/src/math/generic/dsqrtl.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//===-- Implementation of dsqrtl function ---------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/math/dsqrtl.h"
-#include "src/__support/FPUtil/generic/sqrt.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-LLVM_LIBC_FUNCTION(double, dsqrtl, (long double x)) {
-  return fputil::sqrt<double>(x);
-}
-
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp
index a4dbf38ab27d0..d9485864df899 100644
--- a/libc/src/math/generic/expm1.cpp
+++ b/libc/src/math/generic/expm1.cpp
@@ -30,6 +30,13 @@
 #define LIBC_MATH_EXPM1_SKIP_ACCURATE_PASS
 #endif
 
+// #define DEBUGDEBUG
+
+#ifdef DEBUGDEBUG
+#include <iomanip>
+#include <iostream>
+#endif
+
 namespace LIBC_NAMESPACE_DECL {
 
 using fputil::DoubleDouble;
diff --git a/libc/src/math/generic/fabs.cpp b/libc/src/math/generic/fabs.cpp
index 55fa958cd7c00..472297aecb2f7 100644
--- a/libc/src/math/generic/fabs.cpp
+++ b/libc/src/math/generic/fabs.cpp
@@ -13,12 +13,6 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(double, fabs, (double x)) {
-#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
-  return __builtin_fabs(x);
-#else
-  return fputil::abs(x);
-#endif
-}
+LLVM_LIBC_FUNCTION(double, fabs, (double x)) { return fputil::abs(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabsf.cpp b/libc/src/math/generic/fabsf.cpp
index 2ba18d09bbd5b..ad4fcb30c795d 100644
--- a/libc/src/math/generic/fabsf.cpp
+++ b/libc/src/math/generic/fabsf.cpp
@@ -13,12 +13,6 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float, fabsf, (float x)) {
-#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
-  return __builtin_fabsf(x);
-#else
-  return fputil::abs(x);
-#endif
-}
+LLVM_LIBC_FUNCTION(float, fabsf, (float x)) { return fputil::abs(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabsf16.cpp b/libc/src/math/generic/fabsf16.cpp
index 02e11330db718..57671fb6067e2 100644
--- a/libc/src/math/generic/fabsf16.cpp
+++ b/libc/src/math/generic/fabsf16.cpp
@@ -10,20 +10,9 @@
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
-#include "src/__support/macros/properties/compiler.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) {
-  // For x86, GCC generates better code from the generic implementation.
-  // https://godbolt.org/z/K9orM4hTa
-#if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT) &&                                 \
-    !(defined(LIBC_TARGET_ARCH_IS_X86) && defined(LIBC_COMPILER_IS_GCC))
-  return __builtin_fabsf16(x);
-#else
-  return fputil::abs(x);
-#endif
-}
+LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) { return fputil::abs(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index b9b10bd2b4f71..513f6ad723d56 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -51,8 +51,6 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.OSUtil.osutil
     .exit_handler
-    .at_quick_exit
-    .atexit
 )
 
 add_entrypoint_object(
@@ -464,11 +462,15 @@ add_entrypoint_object(
 
 # TODO: Move all exit functions to the linux-specific directory.
 
-if(TARGET libc.src.__support.threads.mutex)
-add_header_library(
+if (TARGET libc.src.__support.threads.mutex)
+add_object_library(
   exit_handler
+  SRCS
+    exit_handler.cpp
   HDRS
     exit_handler.h
+  CXX_STANDARD
+    20 # For constinit 
   DEPENDS
     libc.src.__support.CPP.mutex
     libc.src.__support.CPP.new
@@ -485,8 +487,6 @@ add_entrypoint_object(
     atexit.cpp
   HDRS
     atexit.h
-  CXX_STANDARD
-    20 # For constinit 
   DEPENDS
     .exit_handler
 )
@@ -497,11 +497,8 @@ add_entrypoint_object(
     at_quick_exit.cpp
   HDRS
     at_quick_exit.h
-  CXX_STANDARD
-    20 # For constinit 
   DEPENDS
     .exit_handler
-    .atexit
 )
 
 list(APPEND exit_deps
diff --git a/libc/src/stdlib/at_quick_exit.cpp b/libc/src/stdlib/at_quick_exit.cpp
index d2b4c0cec986f..7acae8c52def3 100644
--- a/libc/src/stdlib/at_quick_exit.cpp
+++ b/libc/src/stdlib/at_quick_exit.cpp
@@ -14,8 +14,6 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-constinit ExitCallbackList at_quick_exit_callbacks;
-
 LLVM_LIBC_FUNCTION(int, at_quick_exit, (__atexithandler_t callback)) {
   return add_atexit_unit(
       at_quick_exit_callbacks,
diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp
index c8a15dd3cfef2..6844fb7aacaf6 100644
--- a/libc/src/stdlib/atexit.cpp
+++ b/libc/src/stdlib/atexit.cpp
@@ -14,9 +14,6 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-constinit ExitCallbackList atexit_callbacks;
-Mutex handler_list_mtx(false, false, false, false);
-
 extern "C" {
 
 int __cxa_atexit(AtExitCallback *callback, void *payload, void *) {
diff --git a/libc/src/stdlib/exit_handler.cpp b/libc/src/stdlib/exit_handler.cpp
new file mode 100644
index 0000000000000..ed0751a4c889e
--- /dev/null
+++ b/libc/src/stdlib/exit_handler.cpp
@@ -0,0 +1,43 @@
+//===--- Implementation of exit_handler ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/exit_handler.h"
+#include "src/__support/CPP/mutex.h" // lock_guard
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+constinit ExitCallbackList at_quick_exit_callbacks;
+constinit ExitCallbackList atexit_callbacks;
+
+Mutex handler_list_mtx(false, false, false, false);
+
+void stdc_at_exit_func(void *payload) {
+  reinterpret_cast<StdCAtExitCallback *>(payload)();
+}
+
+void call_exit_callbacks(ExitCallbackList &callbacks) {
+  handler_list_mtx.lock();
+  while (!callbacks.empty()) {
+    AtExitUnit &unit = callbacks.back();
+    callbacks.pop_back();
+    handler_list_mtx.unlock();
+    unit.callback(unit.payload);
+    handler_list_mtx.lock();
+  }
+  ExitCallbackList::destroy(&callbacks);
+}
+
+int add_atexit_unit(ExitCallbackList &callbacks, const AtExitUnit &unit) {
+  cpp::lock_guard lock(handler_list_mtx);
+  if (callbacks.push_back(unit))
+    return 0;
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
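The interesting detail in call_exit_callbacks above is the lock-drop around each callback: the list mutex is released while a handler runs, so the handler can itself register more handlers without deadlocking. A minimal sketch of the same pattern in standard C++ (std::mutex and std::vector stand in for the libc Mutex and ExitCallbackList):

    #include <cstdio>
    #include <functional>
    #include <mutex>
    #include <vector>

    std::mutex list_mtx;
    std::vector<std::function<void()>> callbacks;

    void run_callbacks() {
      std::unique_lock<std::mutex> lock(list_mtx);
      while (!callbacks.empty()) {
        auto cb = std::move(callbacks.back());
        callbacks.pop_back();
        lock.unlock(); // drop the lock: cb may register new handlers
        cb();
        lock.lock();
      }
    }

    int main() {
      callbacks.push_back([] { std::puts("runs second"); });
      callbacks.push_back([] { std::puts("runs first"); }); // LIFO, like atexit
      run_callbacks();
    }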
diff --git a/libc/src/stdlib/exit_handler.h b/libc/src/stdlib/exit_handler.h
index 9720c5473940e..41733b7a0e7e5 100644
--- a/libc/src/stdlib/exit_handler.h
+++ b/libc/src/stdlib/exit_handler.h
@@ -38,32 +38,16 @@ using ExitCallbackList = ReverseOrderBlockStore<AtExitUnit, 32>;
 using ExitCallbackList = FixedVector<AtExitUnit, CALLBACK_LIST_SIZE_FOR_TESTS>;
 #endif
 
-// This is handled by the 'atexit' implementation and shared by 'at_quick_exit'.
+extern ExitCallbackList atexit_callbacks;
+extern ExitCallbackList at_quick_exit_callbacks;
+
 extern Mutex handler_list_mtx;
 
-LIBC_INLINE void stdc_at_exit_func(void *payload) {
-  reinterpret_cast<StdCAtExitCallback *>(payload)();
-}
+void stdc_at_exit_func(void *payload);
 
-LIBC_INLINE void call_exit_callbacks(ExitCallbackList &callbacks) {
-  handler_list_mtx.lock();
-  while (!callbacks.empty()) {
-    AtExitUnit &unit = callbacks.back();
-    callbacks.pop_back();
-    handler_list_mtx.unlock();
-    unit.callback(unit.payload);
-    handler_list_mtx.lock();
-  }
-  ExitCallbackList::destroy(&callbacks);
-}
+void call_exit_callbacks(ExitCallbackList &callbacks);
 
-LIBC_INLINE int add_atexit_unit(ExitCallbackList &callbacks,
-                                const AtExitUnit &unit) {
-  cpp::lock_guard lock(handler_list_mtx);
-  if (callbacks.push_back(unit))
-    return 0;
-  return -1;
-}
+int add_atexit_unit(ExitCallbackList &callbacks, const AtExitUnit &unit);
 
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
index 14b419399fe9b..4123157d29fff 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
@@ -25,10 +25,22 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, epoll_pwait2,
                    (int epfd, struct epoll_event *events, int maxevents,
                     const struct timespec *timeout, const sigset_t *sigmask)) {
+#ifdef SYS_epoll_pwait2
   int ret = LIBC_NAMESPACE::syscall_impl<int>(
       SYS_epoll_pwait2, epfd, reinterpret_cast<long>(events), maxevents,
       reinterpret_cast<long>(timeout), reinterpret_cast<long>(sigmask),
       NSIG / 8);
+#elif defined(SYS_epoll_pwait)
+  // Convert nanoseconds to milliseconds, rounding up if there are remaining
+  // nanoseconds
+  long timeout_ms = static_cast<long>(timeout->tv_sec * 1000 +
+                                      (timeout->tv_nsec + 999999) / 1000000);
+  int ret = LIBC_NAMESPACE::syscall_impl<int>(
+      SYS_epoll_pwait, epfd, reinterpret_cast<long>(events), maxevents,
+      timeout_ms, reinterpret_cast<long>(sigmask), NSIG / 8);
+#else
+#error "epoll_pwait and epoll_pwait2 syscalls not available."
+#endif
 
   // A negative return value indicates an error with the magnitude of the
   // value being the error code.
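The SYS_epoll_pwait fallback above (which dereferences timeout, so it assumes a non-null timespec) deliberately rounds up: a nonzero timeout below one millisecond must not truncate to 0, since 0 would turn a brief wait into an immediate return. For example, tv_sec = 0 and tv_nsec = 1 gives (1 + 999999) / 1000000 = 1 ms. A standalone sketch of the same conversion:

    #include <cstdio>
    #include <ctime>

    // Ceiling conversion from a timespec to whole milliseconds.
    long timespec_to_ms_ceil(const std::timespec &ts) {
      return static_cast<long>(ts.tv_sec * 1000 +
                               (ts.tv_nsec + 999999) / 1000000);
    }

    int main() {
      std::timespec ts{};
      ts.tv_sec = 0;
      ts.tv_nsec = 1; // 1 ns
      std::printf("%ld ms\n", timespec_to_ms_ceil(ts)); // prints "1 ms"
    }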
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 6427b86158077..4adc2f5c725f7 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -11,10 +11,17 @@ function(add_unittest_framework_library name)
                         "header only libraries, use 'add_header_library'")
   endif()
 
+  # The Nvidia 'nvlink' linker does not support static libraries.
+  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    set(library_type OBJECT)
+  else()
+    set(library_type STATIC)
+  endif()
+
   foreach(lib IN ITEMS ${name}.unit ${name}.hermetic)
     add_library(
       ${lib}
-      STATIC
+      ${library_type}
       EXCLUDE_FROM_ALL
       ${TEST_LIB_SRCS}
       ${TEST_LIB_HDRS}
diff --git a/libc/test/src/__support/blockstore_test.cpp b/libc/test/src/__support/blockstore_test.cpp
index de7bd72b96c80..dd74ea18f2c02 100644
--- a/libc/test/src/__support/blockstore_test.cpp
+++ b/libc/test/src/__support/blockstore_test.cpp
@@ -183,10 +183,11 @@ TEST_F(LlvmLibcBlockStoreTest, PopulateAndIterateReverse10) {
   populate_and_iterate<4, 10, true>();
 }
 
-TEST_F(LlvmLibcBlockStoreTest, Back) {
-  back_test<false>();
-  back_test<true>();
-}
+TEST_F(LlvmLibcBlockStoreTest, Back) { back_test<false>(); }
+
+// FIXME: Combining this test with the above test makes the AMDGPU backend
+// generate code which hangs. This should be fixed in the clang compiler.
+TEST_F(LlvmLibcBlockStoreTest, BackReverse) { back_test<true>(); }
 
 TEST_F(LlvmLibcBlockStoreTest, Empty) {
   empty_test<false>();
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 3ad5d98858165..586338b2213de 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -2271,19 +2271,6 @@ add_fp_unittest(
     libc.src.math.fsqrtl
 )
 
-add_fp_unittest(
-  dsqrtl_test
-  NEED_MPFR
-  SUITE
-    libc-math-unittests
-  SRCS
-    dsqrtl_test.cpp
-  HDRS
-    SqrtTest.h
-  DEPENDS
-    libc.src.math.dsqrtl
-)
-
 add_fp_unittest(
   cbrtf_test
   NEED_MPFR
diff --git a/libc/test/src/math/dsqrtl_test.cpp b/libc/test/src/math/dsqrtl_test.cpp
deleted file mode 100644
index c178e389a57cd..0000000000000
--- a/libc/test/src/math/dsqrtl_test.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-//===-- Unittests for dsqrtl ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SqrtTest.h"
-
-#include "src/math/dsqrtl.h"
-
-LIST_NARROWING_SQRT_TESTS(double, long double, LIBC_NAMESPACE::dsqrtl)
diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 63d9768e21899..995e41ba84b03 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/CPP/algorithm.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
 #include "test/src/math/performance_testing/Timer.h"
@@ -29,11 +28,11 @@ template <typename T> class BinaryOpSingleOutputPerf {
   static void run_perf_in_range(Func myFunc, Func otherFunc,
                                 StorageType startingBit, StorageType endingBit,
                                 size_t N, size_t rounds, std::ofstream &log) {
-    if (sizeof(StorageType) <= sizeof(size_t))
-      N = cpp::min(N, static_cast<size_t>(endingBit - startingBit));
+    if (endingBit - startingBit < N)
+      N = endingBit - startingBit;
 
     auto runner = [=](Func func) {
-      [[maybe_unused]] volatile T result;
+      volatile T result;
       if (endingBit < startingBit) {
         return;
       }
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index 72e1a730ac756..a75becba04d07 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_library(
-  libc_diff_test_utils STATIC
+  libc_diff_test_utils
   Timer.cpp
   Timer.h
 )
@@ -95,9 +95,6 @@ add_header_library(
   single_input_single_output_diff
   HDRS
     SingleInputSingleOutputPerf.h
-  DEPENDS
-    libc.src.__support.CPP.algorithm
-    libc.src.__support.FPUtil.fp_bits
 )
 
 add_header_library(
@@ -105,7 +102,6 @@ add_header_library(
   HDRS
     BinaryOpSingleOutputPerf.h
   DEPENDS
-    libc.src.__support.CPP.algorithm
     libc.src.__support.FPUtil.fp_bits
 )
 
@@ -406,18 +402,3 @@ add_perf_binary(
   LINK_LIBRARIES
     LibcFPTestHelpers
 )
-
-add_perf_binary(
-  misc_basic_ops_perf
-  SRCS
-    misc_basic_ops_perf.cpp
-  DEPENDS
-    .binary_op_single_output_diff
-    .single_input_single_output_diff
-    libc.src.math.copysignf
-    libc.src.math.copysignf16
-    libc.src.math.fabsf
-    libc.src.math.fabsf16
-  COMPILE_OPTIONS
-    -fno-builtin
-)
diff --git a/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h b/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
index efad1259d6bf1..48ae43d6315e3 100644
--- a/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/CPP/algorithm.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
 #include "test/src/math/performance_testing/Timer.h"
@@ -27,21 +26,16 @@ template <typename T> class SingleInputSingleOutputPerf {
 
   static void runPerfInRange(Func myFunc, Func otherFunc,
                              StorageType startingBit, StorageType endingBit,
-                             size_t rounds, std::ofstream &log) {
-    size_t n = 10'010'001;
-    if (sizeof(StorageType) <= sizeof(size_t))
-      n = cpp::min(n, static_cast<size_t>(endingBit - startingBit));
-
+                             std::ofstream &log) {
     auto runner = [=](Func func) {
-      StorageType step = (endingBit - startingBit) / n;
+      constexpr StorageType N = 10'010'001;
+      StorageType step = (endingBit - startingBit) / N;
       if (step == 0)
         step = 1;
-      [[maybe_unused]] volatile T result;
-      for (size_t i = 0; i < rounds; i++) {
-        for (StorageType bits = startingBit; bits < endingBit; bits += step) {
-          T x = FPBits(bits).get_val();
-          result = func(x);
-        }
+      volatile T result;
+      for (StorageType bits = startingBit; bits < endingBit; bits += step) {
+        T x = FPBits(bits).get_val();
+        result = func(x);
       }
     };
 
@@ -50,7 +44,8 @@ template <typename T> class SingleInputSingleOutputPerf {
     runner(myFunc);
     timer.stop();
 
-    double myAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
+    StorageType numberOfRuns = endingBit - startingBit + 1;
+    double myAverage = static_cast<double>(timer.nanoseconds()) / numberOfRuns;
     log << "-- My function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << myAverage << " ns/op \n";
@@ -61,7 +56,8 @@ template <typename T> class SingleInputSingleOutputPerf {
     runner(otherFunc);
     timer.stop();
 
-    double otherAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
+    double otherAverage =
+        static_cast<double>(timer.nanoseconds()) / numberOfRuns;
     log << "-- Other function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << otherAverage << " ns/op \n";
@@ -72,18 +68,15 @@ template <typename T> class SingleInputSingleOutputPerf {
     log << "     Mine / Other's  : " << myAverage / otherAverage << " \n";
   }
 
-  static void runPerf(Func myFunc, Func otherFunc, size_t rounds,
-                      const char *logFile) {
+  static void runPerf(Func myFunc, Func otherFunc, const char *logFile) {
     std::ofstream log(logFile);
     log << " Performance tests with inputs in denormal range:\n";
     runPerfInRange(myFunc, otherFunc, /* startingBit= */ StorageType(0),
-                   /* endingBit= */ FPBits::max_subnormal().uintval(), rounds,
-                   log);
+                   /* endingBit= */ FPBits::max_subnormal().uintval(), log);
     log << "\n Performance tests with inputs in normal range:\n";
     runPerfInRange(myFunc, otherFunc,
                    /* startingBit= */ FPBits::min_normal().uintval(),
-                   /* endingBit= */ FPBits::max_normal().uintval(), rounds,
-                   log);
+                   /* endingBit= */ FPBits::max_normal().uintval(), log);
   }
 };
 
@@ -93,13 +86,6 @@ template <typename T> class SingleInputSingleOutputPerf {
 #define SINGLE_INPUT_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename)        \
   int main() {                                                                 \
     LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf(          \
-        &myFunc, &otherFunc, 1, filename);                                     \
+        &myFunc, &otherFunc, filename);                                        \
     return 0;                                                                  \
   }
-
-#define SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(T, myFunc, otherFunc, rounds,       \
-                                           filename)                           \
-  {                                                                            \
-    LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf(          \
-        &myFunc, &otherFunc, rounds, filename);                                \
-  }
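For reference, the restored runner above sweeps the input format's bit patterns from startingBit to endingBit in roughly N evenly spaced steps, reinterpreting each pattern as a value of T; the volatile result keeps the compiler from optimizing the calls away. A rough standalone sketch for float (names are illustrative; C++20 for std::bit_cast):

    #include <bit>
    #include <cstdint>

    template <class Func>
    void sweep(Func f, uint32_t start, uint32_t end) {
      constexpr uint32_t N = 10'010'001;
      uint32_t step = (end - start) / N;
      if (step == 0)
        step = 1;
      volatile float result;
      for (uint32_t bits = start; bits < end; bits += step)
        result = f(std::bit_cast<float>(bits)); // volatile write defeats DCE
    }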
diff --git a/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp b/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp
deleted file mode 100644
index ace1d21c62c32..0000000000000
--- a/libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- Performance test for miscellaneous basic operations ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "BinaryOpSingleOutputPerf.h"
-#include "SingleInputSingleOutputPerf.h"
-#include "src/math/copysignf.h"
-#include "src/math/copysignf16.h"
-#include "src/math/fabsf.h"
-#include "src/math/fabsf16.h"
-
-#include <math.h>
-
-static constexpr size_t FLOAT16_ROUNDS = 20'000;
-static constexpr size_t FLOAT_ROUNDS = 40;
-
-// LLVM libc might be the only libc implementation with support for float16 math
-// functions currently. We can't compare our float16 functions against the
-// system libc, so we compare them against this placeholder function.
-float16 placeholder_unaryf16(float16 x) { return x; }
-float16 placeholder_binaryf16(float16 x, float16 y) { return x; }
-
-int main() {
-  SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fabsf16,
-                                     placeholder_unaryf16, FLOAT16_ROUNDS,
-                                     "fabsf16_perf.log")
-  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::copysignf16,
-                                  placeholder_binaryf16, FLOAT16_ROUNDS,
-                                  "copysignf16_perf.log")
-
-  SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fabsf, fabsf,
-                                     FLOAT_ROUNDS, "fabsf_perf.log")
-  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::copysignf, copysignf,
-                                  FLOAT_ROUNDS, "copysignf_perf.log")
-
-  return 0;
-}
diff --git a/libc/test/src/math/smoke/AddTest.h b/libc/test/src/math/smoke/AddTest.h
index 0b7e395a22d4c..c713c5a88763c 100644
--- a/libc/test/src/math/smoke/AddTest.h
+++ b/libc/test/src/math/smoke/AddTest.h
@@ -38,8 +38,23 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID);
 
     InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val();
-    EXPECT_FP_IS_NAN(func(qnan_42, zero));
-    EXPECT_FP_IS_NAN(func(zero, qnan_42));
+    EXPECT_FP_EQ(InType(0x42.0p+0),
+                 LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero)));
+    EXPECT_FP_EQ(InType(0x42.0p+0),
+                 LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42)));
+
+    if constexpr (sizeof(OutType) < sizeof(InType)) {
+      InStorageType max_payload = InFPBits::FRACTION_MASK >> 1;
+      InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val();
+      EXPECT_FP_EQ(zero,
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero)));
+      EXPECT_FP_EQ(zero,
+                   LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max)));
+      EXPECT_FP_EQ(InType(0x42.0p+0),
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42)));
+      EXPECT_FP_EQ(InType(0x42.0p+0),
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max)));
+    }
 
     EXPECT_FP_EQ(inf, func(inf, zero));
     EXPECT_FP_EQ(neg_inf, func(neg_inf, zero));
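On the updated expectations above: fputil::getpayload returns a NaN's payload as a floating-point integer value, so a quiet NaN constructed with payload 0x42 reads back as 66.0, which is exactly what the hex-float literal 0x42.0p+0 spells. A small standalone check of that encoding for float (C++20 for std::bit_cast; illustrative, not the libc helpers):

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // float quiet NaN: exponent all ones (0x7F800000), quiet bit
      // (0x00400000), payload in the remaining low 22 fraction bits.
      float q = std::bit_cast<float>(0x7FC00000u | 0x42u);
      uint32_t payload = std::bit_cast<uint32_t>(q) & 0x003FFFFFu;
      std::printf("payload = %u (%.1f)\n", payload,
                  static_cast<double>(payload)); // payload = 66 (66.0)
    }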
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 1b3c51739c0fe..f7caa03d11646 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3996,30 +3996,6 @@ add_fp_unittest(
     libc.src.math.fsqrtf128
 )
 
-add_fp_unittest(
-  dsqrtl_test
-  SUITE
-    libc-math-smoke-tests
-  SRCS
-    dsqrtl_test.cpp
-  HDRS
-    SqrtTest.h
-  DEPENDS
-    libc.src.math.dsqrtl
-)
-
-add_fp_unittest(
-  dsqrtf128_test
-  SUITE
-    libc-math-smoke-tests
-  SRCS
-    dsqrtf128_test.cpp
-  HDRS
-    SqrtTest.h
-  DEPENDS
-    libc.src.math.dsqrtf128
-)
-
 add_fp_unittest(
   sin_test
   SUITE
diff --git a/libc/test/src/math/smoke/DivTest.h b/libc/test/src/math/smoke/DivTest.h
index b30fc17aac1d5..71cfb326b97cc 100644
--- a/libc/test/src/math/smoke/DivTest.h
+++ b/libc/test/src/math/smoke/DivTest.h
@@ -36,8 +36,23 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID);
 
     InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val();
-    EXPECT_FP_IS_NAN(func(qnan_42, zero));
-    EXPECT_FP_IS_NAN(func(zero, qnan_42));
+    EXPECT_FP_EQ(InType(0x42.0p+0),
+                 LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero)));
+    EXPECT_FP_EQ(InType(0x42.0p+0),
+                 LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42)));
+
+    if constexpr (sizeof(OutType) < sizeof(InType)) {
+      InStorageType max_payload = InFPBits::FRACTION_MASK >> 1;
+      InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val();
+      EXPECT_FP_EQ(zero,
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero)));
+      EXPECT_FP_EQ(zero,
+                   LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max)));
+      EXPECT_FP_EQ(InType(0x42.0p+0),
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42)));
+      EXPECT_FP_EQ(InType(0x42.0p+0),
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max)));
+    }
 
     EXPECT_FP_EQ(inf, func(inf, zero));
     EXPECT_FP_EQ(neg_inf, func(neg_inf, zero));
diff --git a/libc/test/src/math/smoke/MulTest.h b/libc/test/src/math/smoke/MulTest.h
index 0c847e39687b7..e2298eaeeb216 100644
--- a/libc/test/src/math/smoke/MulTest.h
+++ b/libc/test/src/math/smoke/MulTest.h
@@ -38,8 +38,23 @@ class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID);
 
     InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val();
-    EXPECT_FP_IS_NAN(func(qnan_42, zero));
-    EXPECT_FP_IS_NAN(func(zero, qnan_42));
+    EXPECT_FP_EQ(InType(0x42.0p+0),
+                 LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero)));
+    EXPECT_FP_EQ(InType(0x42.0p+0),
+                 LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42)));
+
+    if constexpr (sizeof(OutType) < sizeof(InType)) {
+      InStorageType max_payload = InFPBits::FRACTION_MASK >> 1;
+      InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val();
+      EXPECT_FP_EQ(zero,
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero)));
+      EXPECT_FP_EQ(zero,
+                   LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max)));
+      EXPECT_FP_EQ(InType(0x42.0p+0),
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42)));
+      EXPECT_FP_EQ(InType(0x42.0p+0),
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max)));
+    }
 
     EXPECT_FP_EQ(inf, func(inf, InType(1.0)));
     EXPECT_FP_EQ(neg_inf, func(neg_inf, InType(1.0)));
diff --git a/libc/test/src/math/smoke/SubTest.h b/libc/test/src/math/smoke/SubTest.h
index e5e04996affa8..a4b3822aca43c 100644
--- a/libc/test/src/math/smoke/SubTest.h
+++ b/libc/test/src/math/smoke/SubTest.h
@@ -37,8 +37,23 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID);
 
     InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val();
-    EXPECT_FP_IS_NAN(func(qnan_42, zero));
-    EXPECT_FP_IS_NAN(func(zero, qnan_42));
+    EXPECT_FP_EQ(InType(0x42.0p+0),
+                 LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero)));
+    EXPECT_FP_EQ(InType(0x42.0p+0),
+                 LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42)));
+
+    if constexpr (sizeof(OutType) < sizeof(InType)) {
+      InStorageType max_payload = InFPBits::FRACTION_MASK >> 1;
+      InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val();
+      EXPECT_FP_EQ(zero,
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero)));
+      EXPECT_FP_EQ(zero,
+                   LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max)));
+      EXPECT_FP_EQ(InType(0x42.0p+0),
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42)));
+      EXPECT_FP_EQ(InType(0x42.0p+0),
+                   LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max)));
+    }
 
     EXPECT_FP_EQ(inf, func(inf, zero));
     EXPECT_FP_EQ(neg_inf, func(neg_inf, zero));
diff --git a/libc/test/src/math/smoke/dsqrtf128_test.cpp b/libc/test/src/math/smoke/dsqrtf128_test.cpp
deleted file mode 100644
index 6f98d1cfc6042..0000000000000
--- a/libc/test/src/math/smoke/dsqrtf128_test.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-//===-- Unittests for dsqrtf128 -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SqrtTest.h"
-
-#include "src/math/dsqrtf128.h"
-
-LIST_NARROWING_SQRT_TESTS(double, float128, LIBC_NAMESPACE::dsqrtf128)
diff --git a/libc/test/src/math/smoke/dsqrtl_test.cpp b/libc/test/src/math/smoke/dsqrtl_test.cpp
deleted file mode 100644
index c178e389a57cd..0000000000000
--- a/libc/test/src/math/smoke/dsqrtl_test.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-//===-- Unittests for dsqrtl ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SqrtTest.h"
-
-#include "src/math/dsqrtl.h"
-
-LIST_NARROWING_SQRT_TESTS(double, long double, LIBC_NAMESPACE::dsqrtl)
diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
index 8cb5f867453e4..0895c33167151 100644
--- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
@@ -9,16 +9,10 @@
 #include <linux/magic.h>
 using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
-#ifdef SYS_statfs64
-using StatFs = statfs64;
-#else
-using StatFs = statfs;
-#endif
-
 namespace LIBC_NAMESPACE_DECL {
-static int fstatfs(int fd, StatFs *buf) {
+static int fstatfs(int fd, struct statfs *buf) {
   using namespace statfs_utils;
-  if (cpp::optional<StatFs> result = linux_fstatfs(fd)) {
+  if (cpp::optional<LinuxStatFs> result = linux_fstatfs(fd)) {
     *buf = *result;
     return 0;
   }
@@ -35,7 +29,7 @@ struct PathFD {
 };
 
 TEST(LlvmLibcSysStatvfsTest, FstatfsBasic) {
-  StatFs buf;
+  struct statfs buf;
   ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/"), &buf), Succeeds());
   ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/proc"), &buf), Succeeds());
   ASSERT_EQ(buf.f_type, static_cast<decltype(buf.f_type)>(PROC_SUPER_MAGIC));
diff --git a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
index 5329adb54d64d..6719c1ab26865 100644
--- a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
@@ -6,14 +6,8 @@
 #include <linux/magic.h>
 using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
-#ifdef SYS_statfs64
-using StatFs = statfs64;
-#else
-using StatFs = statfs;
-#endif
-
 namespace LIBC_NAMESPACE_DECL {
-static int statfs(const char *path, StatFs *buf) {
+static int statfs(const char *path, struct statfs *buf) {
   using namespace statfs_utils;
   if (cpp::optional<LinuxStatFs> result = linux_statfs(path)) {
     *buf = *result;
@@ -24,7 +18,7 @@ static int statfs(const char *path, StatFs *buf) {
 } // namespace LIBC_NAMESPACE_DECL
 
 TEST(LlvmLibcSysStatfsTest, StatfsBasic) {
-  StatFs buf;
+  struct statfs buf;
   ASSERT_THAT(LIBC_NAMESPACE::statfs("/", &buf), Succeeds());
   ASSERT_THAT(LIBC_NAMESPACE::statfs("/proc", &buf), Succeeds());
   ASSERT_EQ(buf.f_type, static_cast<decltype(buf.f_type)>(PROC_SUPER_MAGIC));
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index f88ee2af35c52..d2c28c19ab176 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -893,9 +893,6 @@ template void explain_unary_operation_single_output_error(Operation op, double,
 template void explain_unary_operation_single_output_error(Operation op,
                                                           long double, float,
                                                           double, RoundingMode);
-template void explain_unary_operation_single_output_error(Operation op,
-                                                          long double, double,
-                                                          double, RoundingMode);
 
 #ifdef LIBC_TYPES_HAS_FLOAT16
 template void explain_unary_operation_single_output_error(Operation op, float16,
@@ -1121,9 +1118,6 @@ template bool compare_unary_operation_single_output(Operation, double, float,
 template bool compare_unary_operation_single_output(Operation, long double,
                                                     float, double,
                                                     RoundingMode);
-template bool compare_unary_operation_single_output(Operation, long double,
-                                                    double, double,
-                                                    RoundingMode);
 #ifdef LIBC_TYPES_HAS_FLOAT16
 template bool compare_unary_operation_single_output(Operation, float16, float16,
                                                     double, RoundingMode);
diff --git a/libclc/generic/lib/common/sign.cl b/libclc/generic/lib/common/sign.cl
index 7b6b793be79c1..25832e0b4f8b9 100644
--- a/libclc/generic/lib/common/sign.cl
+++ b/libclc/generic/lib/common/sign.cl
@@ -26,12 +26,3 @@ SIGN(double, )
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sign, double)
 
 #endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-SIGN(half,)
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, sign, half)
-
-#endif
diff --git a/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake b/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake
index 4a9389fdcb41c..4860b590dcde9 100644
--- a/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake
+++ b/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake
@@ -1,4 +1,2 @@
 set(LIBCXX_HARDENING_MODE "fast" CACHE STRING "")
 set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS" CACHE STRING "")
-set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING" CACHE STRING "")
-set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR" CACHE STRING "")
diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst
index 67791a5e55ac7..9aac059d27ecf 100644
--- a/libcxx/docs/Hardening.rst
+++ b/libcxx/docs/Hardening.rst
@@ -325,22 +325,6 @@ Vendors can use the following ABI options to enable additional hardening checks:
   - ``span``;
   - ``string_view``.
 
-- ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING`` -- changes the iterator type of
-  ``basic_string`` to a bounded iterator that keeps track of whether it's within
-  the bounds of the original container and asserts it on every dereference and
-  when performing iterator arithmetics.
-
-  ABI impact: changes the iterator type of ``basic_string`` and its
-  specializations, such as ``string`` and ``wstring``.
-
-- ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR`` -- changes the iterator type of
-  ``vector`` to a bounded iterator that keeps track of whether it's within the
-  bounds of the original container and asserts it on every dereference and when
-  performing iterator arithmetics. Note: this doesn't yet affect
-  ``vector<bool>``.
-
-  ABI impact: changes the iterator type of ``vector`` (except ``vector<bool>``).
-
 ABI tags
 --------
 
@@ -383,10 +367,10 @@ Hardened containers status
       - ❌
     * - ``vector``
       - ✅
-      - ✅ (see note)
+      - ❌
     * - ``string``
       - ✅
-      - ✅ (see note)
+      - ❌
     * - ``list``
       - ✅
       - ❌
@@ -445,12 +429,6 @@ Hardened containers status
       - ❌
       - N/A
 
-Note: for ``vector`` and ``string``, the iterator does not check for
-invalidation (accesses made via an invalidated iterator still lead to undefined
-behavior)
-
-Note: ``vector<bool>`` iterator is not currently hardened.
-
 Testing
 =======
 
diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst
index 0454cfa3dbc36..5d98ade8d9918 100644
--- a/libcxx/docs/ReleaseNotes.rst
+++ b/libcxx/docs/ReleaseNotes.rst
@@ -1,4 +1,4 @@
-.. include:: ReleaseNotes/20.rst
+.. include:: ReleaseNotes/19.rst
 
 .. Make sure to reference the non-live release notes in a toctree to avoid Sphinx errors.
 .. toctree::
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 439f552db59a8..43767552960e4 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -87,20 +87,6 @@ Improvements and New Features
 - ``std::ignore``\s ``const __ignore_t& operator=(_Tp&&) const`` was changed to
   ``const __ignore_type& operator=(const _Tp&) const noexcept`` for all language versions.
 
-- Vendors can now configure the ABI so that ``string`` and ``vector`` will use bounded iterators when hardening is
-  enabled. Note that checks for iterator invalidation are currently not supported -- any accesses made through an
-  invalidated bounded iterator will still result in undefined behavior (bounded iterators follow the normal invalidation
-  rules of the associated container). ``string`` bounded iterators use the logical size of the container (``index
-  < str.size()``) whereas ``vector`` bounded iterators use the "physical" size of the container (``index
-  < vec.capacity()``) which is a less strict check; refer to the implementation for further details.
-
-  Bounded iterators can be enabled via the ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING`` ABI macro for ``string`` and via
-  the ``_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR`` ABI macro for ``vector``; note that checks will only be performed if
-  the hardening mode is set to ``fast`` or above (i.e., no checking is performed in the unchecked mode, even if bounded
-  iterators are enabled in the ABI configuration).
-
-  Note: bounded iterators currently are not supported for ``vector<bool>``.
-
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index 1f35ca5713a9b..50104052effad 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -146,7 +146,7 @@
 "`3610 <https://wg21.link/LWG3610>`__","``iota_view::size`` sometimes rejects integer-class types","February 2022","","","|ranges|"
 "`3612 <https://wg21.link/LWG3612>`__","Inconsistent pointer alignment in ``std::format`` ","February 2022","|Complete|","14.0","|format|"
 "`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","",""
-"`3618 <https://wg21.link/LWG3618>`__","Unnecessary ``iter_move`` for ``transform_view::iterator`` ","February 2022","|Complete|","19.0","|ranges|"
+"`3618 <https://wg21.link/LWG3618>`__","Unnecessary ``iter_move`` for ``transform_view::iterator`` ","February 2022","","","|ranges|"
 "`3619 <https://wg21.link/LWG3619>`__","Specification of ``vformat_to`` contains ill-formed ``formatted_size`` calls","February 2022","|Nothing to do|","","|format|"
 "`3621 <https://wg21.link/LWG3621>`__","Remove feature-test macro ``__cpp_lib_monadic_optional`` ","February 2022","|Complete|","15.0"
 "`3632 <https://wg21.link/LWG3632>`__","``unique_ptr`` ""Mandates: This constructor is not selected by class template argument deduction""","February 2022","|Nothing to do|",""
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index 92f4908487ae7..f8970320ce1fa 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -6,9 +6,9 @@
 "","","","","","",""
 "`P1682R3 <https://wg21.link/P1682R3>`__","LWG","std::to_underlying for enumerations","February 2021","|Complete|","13.0"
 "`P2017R1 <https://wg21.link/P2017R1>`__","LWG","Conditionally borrowed ranges","February 2021","|Complete|","16.0","|ranges|"
-"`P2160R1 <https://wg21.link/P2160R1>`__","LWG","Locks lock lockables","February 2021","Nothing to do",""
+"`P2160R1 <https://wg21.link/P2160R1>`__","LWG","Locks lock lockables","February 2021","",""
 "`P2162R2 <https://wg21.link/P2162R2>`__","LWG","Inheriting from std::variant","February 2021","|Complete|","13.0"
-"`P2212R2 <https://wg21.link/P2212R2>`__","LWG","Relax Requirements for time_point::clock","February 2021","Nothing to do",""
+"`P2212R2 <https://wg21.link/P2212R2>`__","LWG","Relax Requirements for time_point::clock","February 2021","",""
 "`P2259R1 <https://wg21.link/P2259R1>`__","LWG","Repairing input range adaptors and counted_iterator","February 2021","","","|ranges|"
 "","","","","","",""
 "`P0401R6 <https://wg21.link/P0401R6>`__","LWG","Providing size feedback in the Allocator interface","June 2021","|Complete|","15.0"
@@ -29,12 +29,12 @@
 "`P1072R10 <https://wg21.link/P1072R10>`__","LWG","``basic_string::resize_and_overwrite``","October 2021","|Complete|","14.0"
 "`P1147R1 <https://wg21.link/P1147R1>`__","LWG","Printing ``volatile`` Pointers","October 2021","|Complete|","14.0"
 "`P1272R4 <https://wg21.link/P1272R4>`__","LWG","Byteswapping for fun&&nuf","October 2021","|Complete|","14.0"
-"`P1675R2 <https://wg21.link/P1675R2>`__","LWG","``rethrow_exception`` must be allowed to copy","October 2021","Nothing to do",""
+"`P1675R2 <https://wg21.link/P1675R2>`__","LWG","``rethrow_exception`` must be allowed to copy","October 2021","",""
 "`P2077R3 <https://wg21.link/P2077R3>`__","LWG","Heterogeneous erasure overloads for associative containers","October 2021","",""
 "`P2251R1 <https://wg21.link/P2251R1>`__","LWG","Require ``span`` & ``basic_string_view`` to be Trivially Copyable","October 2021","|Complete|","14.0"
 "`P2301R1 <https://wg21.link/P2301R1>`__","LWG","Add a ``pmr`` alias for ``std::stacktrace``","October 2021","",""
 "`P2321R2 <https://wg21.link/P2321R2>`__","LWG","``zip``","October 2021","|In Progress|","","|ranges|"
-"`P2340R1 <https://wg21.link/P2340R1>`__","LWG","Clarifying the status of the 'C headers'","October 2021","Nothing to do",""
+"`P2340R1 <https://wg21.link/P2340R1>`__","LWG","Clarifying the status of the 'C headers'","October 2021","",""
 "`P2393R1 <https://wg21.link/P2393R1>`__","LWG","Cleaning up ``integer``-class types","October 2021","",""
 "`P2401R0 <https://wg21.link/P2401R0>`__","LWG","Add a conditional ``noexcept`` specification to ``std::exchange``","October 2021","|Complete|","14.0"
 "","","","","","",""
@@ -74,7 +74,7 @@
 "`P2438R2 <https://wg21.link/P2438R2>`__","LWG","``std::string::substr() &&``","July 2022","|Complete|","16.0"
 "`P2445R1 <https://wg21.link/P2445R1>`__","LWG","``forward_like``","July 2022","|Complete|","16.0"
 "`P2446R2 <https://wg21.link/P2446R2>`__","LWG","``views::as_rvalue``","July 2022","|Complete|","16.0","|ranges|"
-"`P2460R2 <https://wg21.link/P2460R2>`__","LWG","Relax requirements on ``wchar_t`` to match existing practices","July 2022","Nothing to do",""
+"`P2460R2 <https://wg21.link/P2460R2>`__","LWG","Relax requirements on ``wchar_t`` to match existing practices","July 2022","",""
 "`P2465R3 <https://wg21.link/P2465R3>`__","LWG","Standard Library Modules ``std`` and ``std.compat``","July 2022","|Complete|","19.0",""
 "`P2467R1 <https://wg21.link/P2467R1>`__","LWG","Support exclusive mode for ``fstreams``","July 2022","|Complete|","18.0",""
 "`P2474R2 <https://wg21.link/P2474R2>`__","LWG","``views::repeat``","July 2022","|Complete|","17.0","|ranges|"
@@ -120,4 +120,4 @@
 "`P2614R2 <https://wg21.link/P2614R2>`__","LWG", "Deprecate ``numeric_limits::has_denorm``","February 2023","|Complete|","18.0",""
 "`P2588R3 <https://wg21.link/P2588R3>`__","LWG", "``barrier``’s phase completion guarantees","February 2023","","",""
 "`P2763R1 <https://wg21.link/P2763R1>`__","LWG", "``layout_stride`` static extents default constructor fix","February 2023","","",""
-"`P2736R2 <https://wg21.link/P2736R2>`__","CWG","Referencing The Unicode Standard","February 2023","Complete","19.0","|format|"
+"`P2736R2 <https://wg21.link/P2736R2>`__","CWG","Referencing The Unicode Standard","February 2023","","","|format|"
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 0be25a5fd226f..108f700823cbf 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -27,7 +27,7 @@
 // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM.
 // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is
 // defined to XXYYZZ.
-#  define _LIBCPP_VERSION 200000
+#  define _LIBCPP_VERSION 190000
 
 #  define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y
 #  define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y)
diff --git a/libcxx/include/__configuration/abi.h b/libcxx/include/__configuration/abi.h
index 0422b645727d8..710548d90a649 100644
--- a/libcxx/include/__configuration/abi.h
+++ b/libcxx/include/__configuration/abi.h
@@ -141,19 +141,6 @@
 // - `string_view`.
 // #define _LIBCPP_ABI_BOUNDED_ITERATORS
 
-// Changes the iterator type of `basic_string` to a bounded iterator that keeps track of whether it's within the bounds
-// of the original container and asserts it on every dereference and when performing iterator arithmetics.
-//
-// ABI impact: changes the iterator type of `basic_string` and its specializations, such as `string` and `wstring`.
-// #define _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
-
-// Changes the iterator type of `vector` to a bounded iterator that keeps track of whether it's within the bounds of the
-// original container and asserts it on every dereference and when performing iterator arithmetics. Note: this doesn't
-// yet affect `vector<bool>`.
-//
-// ABI impact: changes the iterator type of `vector` (except `vector<bool>`).
-// #define _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
-
 #if defined(_LIBCPP_COMPILER_CLANG_BASED)
 #  if defined(__APPLE__)
 #    if defined(__i386__) || defined(__x86_64__)
diff --git a/libcxx/include/__iterator/bounded_iter.h b/libcxx/include/__iterator/bounded_iter.h
index ce0823b8c97e4..a8f66f4a0126f 100644
--- a/libcxx/include/__iterator/bounded_iter.h
+++ b/libcxx/include/__iterator/bounded_iter.h
@@ -225,8 +225,6 @@ struct __bounded_iter {
 private:
   template <class>
   friend struct pointer_traits;
-  template <class, class>
-  friend struct __bounded_iter;
   _Iterator __current_;       // current iterator
   _Iterator __begin_, __end_; // valid range represented as [begin, end]
 };
diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h
index bcce389c0e680..dc3aaa59ed8c3 100644
--- a/libcxx/include/__ranges/transform_view.h
+++ b/libcxx/include/__ranges/transform_view.h
@@ -326,6 +326,13 @@ class transform_view<_View, _Fn>::__iterator : public __transform_view_iterator_
   {
     return __x.__current_ - __y.__current_;
   }
+
+  _LIBCPP_HIDE_FROM_ABI friend constexpr decltype(auto) iter_move(const __iterator& __i) noexcept(noexcept(*__i)) {
+    if constexpr (is_lvalue_reference_v<decltype(*__i)>)
+      return std::move(*__i);
+    else
+      return *__i;
+  }
 };
 
 #  if _LIBCPP_STD_VER >= 23
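The iter_move overload restored above distinguishes the two shapes of the transform function's result: if dereferencing yields an lvalue reference, iter_move should cast it to an rvalue; if it yields a prvalue, the temporary can be returned as-is. A generic sketch of that dispatch (illustrative, not the libc++ internals):

    #include <type_traits>
    #include <utility>

    template <class Iter>
    constexpr decltype(auto) iter_move_like(const Iter &it) {
      if constexpr (std::is_lvalue_reference_v<decltype(*it)>)
        return std::move(*it); // lvalue result: hand out an rvalue reference
      else
        return *it;            // prvalue result: already a temporary
    }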
diff --git a/libcxx/include/string b/libcxx/include/string
index ba86a32090825..2fd1b1e745908 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -598,7 +598,6 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 #include <__functional/unary_function.h>
 #include <__fwd/string.h>
 #include <__ios/fpos.h>
-#include <__iterator/bounded_iter.h>
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/reverse_iterator.h>
@@ -823,16 +822,9 @@ public:
                 "Allocator::value_type must be same type as value_type");
   static_assert(__check_valid_allocator<allocator_type>::value, "");
 
-#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
-  // Users might provide custom allocators, and prior to C++20 we have no existing way to detect whether the allocator's
-  // pointer type is contiguous (though it has to be by the Standard). Using the wrapper type ensures the iterator is
-  // considered contiguous.
-  typedef __bounded_iter<__wrap_iter<pointer>> iterator;
-  typedef __bounded_iter<__wrap_iter<const_pointer>> const_iterator;
-#else
+  // TODO: Implement iterator bounds checking without requiring the global database.
   typedef __wrap_iter<pointer> iterator;
   typedef __wrap_iter<const_pointer> const_iterator;
-#endif
   typedef std::reverse_iterator<iterator> reverse_iterator;
   typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
 
@@ -948,33 +940,10 @@ private:
     __init_with_sentinel(std::move(__first), std::move(__last));
   }
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator __make_iterator(pointer __p) {
-#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
-    // Bound the iterator according to the size (and not the capacity, unlike vector).
-    //
-    // By the Standard, string iterators are generally not guaranteed to stay valid when the container is modified,
-    // regardless of whether reallocation occurs. This allows us to check for out-of-bounds accesses using logical size,
-    // a stricter check, since correct code can never rely on being able to access newly-added elements via an existing
-    // iterator.
-    return std::__make_bounded_iter(
-        std::__wrap_iter<pointer>(__p),
-        std::__wrap_iter<pointer>(__get_pointer()),
-        std::__wrap_iter<pointer>(__get_pointer() + size()));
-#else
-    return iterator(__p);
-#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
-  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator __make_iterator(pointer __p) { return iterator(__p); }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator __make_const_iterator(const_pointer __p) const {
-#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
-    // Bound the iterator according to the size (and not the capacity, unlike vector).
-    return std::__make_bounded_iter(
-        std::__wrap_iter<const_pointer>(__p),
-        std::__wrap_iter<const_pointer>(__get_pointer()),
-        std::__wrap_iter<const_pointer>(__get_pointer() + size()));
-#else
     return const_iterator(__p);
-#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING
   }
 
 public:
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 4d83d4b6edda8..45980043a3c15 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -327,7 +327,6 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
 #include <__functional/unary_function.h>
 #include <__fwd/vector.h>
 #include <__iterator/advance.h>
-#include <__iterator/bounded_iter.h>
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/reverse_iterator.h>
@@ -401,16 +400,9 @@ public:
   typedef typename __alloc_traits::difference_type difference_type;
   typedef typename __alloc_traits::pointer pointer;
   typedef typename __alloc_traits::const_pointer const_pointer;
-#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
-  // Users might provide custom allocators, and prior to C++20 we have no existing way to detect whether the allocator's
-  // pointer type is contiguous (though it has to be by the Standard). Using the wrapper type ensures the iterator is
-  // considered contiguous.
-  typedef __bounded_iter<__wrap_iter<pointer>> iterator;
-  typedef __bounded_iter<__wrap_iter<const_pointer>> const_iterator;
-#else
+  // TODO: Implement iterator bounds checking without requiring the global database.
   typedef __wrap_iter<pointer> iterator;
   typedef __wrap_iter<const_pointer> const_iterator;
-#endif
   typedef std::reverse_iterator<iterator> reverse_iterator;
   typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
 
@@ -835,39 +827,12 @@ private:
 
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n);
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n, const_reference __x);
-
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator __make_iter(pointer __p) _NOEXCEPT {
-#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
-    // Bound the iterator according to the capacity, rather than the size.
-    //
-    // Vector guarantees that iterators stay valid as long as no reallocation occurs even if new elements are inserted
-    // into the container; for these cases, we need to make sure that the newly-inserted elements can be accessed
-    // through the bounded iterator without failing checks. The downside is that the bounded iterator won't catch
-    // access that is logically out-of-bounds, i.e., goes beyond the size, but is still within the capacity. With the
-    // current implementation, there is no connection between a bounded iterator and its associated container, so we
-    // don't have a way to update existing valid iterators when the container is resized and thus have to go with
-    // a laxer approach.
-    return std::__make_bounded_iter(
-        std::__wrap_iter<pointer>(__p),
-        std::__wrap_iter<pointer>(this->__begin_),
-        std::__wrap_iter<pointer>(this->__end_cap()));
-#else
     return iterator(__p);
-#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
   }
-
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator __make_iter(const_pointer __p) const _NOEXCEPT {
-#ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
-    // Bound the iterator according to the capacity, rather than the size.
-    return std::__make_bounded_iter(
-        std::__wrap_iter<const_pointer>(__p),
-        std::__wrap_iter<const_pointer>(this->__begin_),
-        std::__wrap_iter<const_pointer>(this->__end_cap()));
-#else
     return const_iterator(__p);
-#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
   }
-
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
   __swap_out_circular_buffer(__split_buffer<value_type, allocator_type&>& __v);
   _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer
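
The comments removed from __make_iter above documented the size-versus-capacity
distinction that made vector's bounded iterators laxer than string's. The
container guarantee they relied on can be shown standalone (an illustrative
sketch, independent of any ABI macro):

  #include <cassert>
  #include <vector>

  int main() {
    std::vector<int> v;
    v.reserve(4);        // capacity >= 4, size == 0
    v.push_back(1);
    auto it = v.begin();
    v.push_back(2);      // no reallocation, so `it` remains valid
    assert(*it == 1);
    assert(*(it + 1) == 2); // the new element is reachable through the old
                            // iterator: an access capacity-based bounds must
                            // permit but size-based bounds would reject
    return 0;
  }
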
diff --git a/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
index 5dcbdad41968a..b03f48434a0e0 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
@@ -14,14 +14,6 @@
 
 template <class T>
 class small_pointer {
-public:
-  using value_type        = T;
-  using difference_type   = std::int16_t;
-  using pointer           = small_pointer;
-  using reference         = T&;
-  using iterator_category = std::random_access_iterator_tag;
-
-private:
   std::uint16_t offset;
 };
 
diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.index.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.index.pass.cpp
deleted file mode 100644
index 63354af5af022..0000000000000
--- a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.index.pass.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// Index iterator out of bounds.
-
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
-
-#include <vector>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "fill_to_capacity.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-  {
-    typedef int T;
-    typedef std::vector<T> C;
-    C c(1);
-    fill_to_capacity(c);
-    C::iterator i = c.begin();
-    assert(i[0] == 0);
-    TEST_LIBCPP_ASSERT_FAILURE(
-        i[c.size()], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
-    TEST_LIBCPP_ASSERT_FAILURE(
-        i[c.size() + 1], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
-  }
-
-  {
-    typedef int T;
-    typedef std::vector<T, min_allocator<T> > C;
-    C c(1);
-    fill_to_capacity(c);
-    C::iterator i = c.begin();
-    assert(i[0] == 0);
-    TEST_LIBCPP_ASSERT_FAILURE(
-        i[c.size()], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
-    TEST_LIBCPP_ASSERT_FAILURE(
-        i[c.size() + 1], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
-  }
-
-  return 0;
-}
diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.add.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.add.pass.cpp
similarity index 64%
rename from libcxx/test/libcxx/containers/sequences/vector/assert.iterator.add.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/debug.iterator.add.pass.cpp
index a066ad30ebd71..42021824ce6ae 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.add.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.add.pass.cpp
@@ -10,14 +10,13 @@
 
 // Add to iterator out of bounds.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <vector>
 #include <cassert>
 
 #include "check_assertion.h"
-#include "fill_to_capacity.h"
 #include "min_allocator.h"
 
 int main(int, char**) {
@@ -25,24 +24,22 @@ int main(int, char**) {
     typedef int T;
     typedef std::vector<T> C;
     C c(1);
-    fill_to_capacity(c);
     C::iterator i = c.begin();
-    i += c.size();
+    i += 1;
     assert(i == c.end());
     i = c.begin();
-    TEST_LIBCPP_ASSERT_FAILURE(i + 2, "__bounded_iter::operator+=: Attempt to advance an iterator past the end");
+    TEST_LIBCPP_ASSERT_FAILURE(i + 2, "Attempted to add/subtract an iterator outside its valid range");
   }
 
   {
     typedef int T;
     typedef std::vector<T, min_allocator<T> > C;
     C c(1);
-    fill_to_capacity(c);
     C::iterator i = c.begin();
-    i += c.size();
+    i += 1;
     assert(i == c.end());
     i = c.begin();
-    TEST_LIBCPP_ASSERT_FAILURE(i + 2, "__bounded_iter::operator+=: Attempt to advance an iterator past the end");
+    TEST_LIBCPP_ASSERT_FAILURE(i + 2, "Attempted to add/subtract an iterator outside its valid range");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.decrement.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.decrement.pass.cpp
similarity index 70%
rename from libcxx/test/libcxx/containers/sequences/vector/assert.iterator.decrement.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/debug.iterator.decrement.pass.cpp
index 59b9c16a6aa0e..d134527a967e5 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.decrement.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.decrement.pass.cpp
@@ -10,8 +10,8 @@
 
 // Decrement iterator prior to begin.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <vector>
 #include <cassert>
@@ -27,7 +27,7 @@ int main(int, char**) {
     C::iterator i = c.end();
     --i;
     assert(i == c.begin());
-    TEST_LIBCPP_ASSERT_FAILURE(--i, "__bounded_iter::operator--: Attempt to rewind an iterator past the start");
+    TEST_LIBCPP_ASSERT_FAILURE(--i, "Attempted to decrement a non-decrementable iterator");
   }
 
   {
@@ -37,7 +37,7 @@ int main(int, char**) {
     C::iterator i = c.end();
     --i;
     assert(i == c.begin());
-    TEST_LIBCPP_ASSERT_FAILURE(--i, "__bounded_iter::operator--: Attempt to rewind an iterator past the start");
+    TEST_LIBCPP_ASSERT_FAILURE(--i, "Attempted to decrement a non-decrementable iterator");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.dereference.pass.cpp
similarity index 64%
rename from libcxx/test/libcxx/containers/sequences/vector/assert.iterator.dereference.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/debug.iterator.dereference.pass.cpp
index 877d3655fbe2e..918cdd74b7916 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.dereference.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.dereference.pass.cpp
@@ -10,13 +10,12 @@
 
 // Dereference non-dereferenceable iterator.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <vector>
 
 #include "check_assertion.h"
-#include "fill_to_capacity.h"
 #include "min_allocator.h"
 
 int main(int, char**) {
@@ -24,18 +23,16 @@ int main(int, char**) {
     typedef int T;
     typedef std::vector<T> C;
     C c(1);
-    fill_to_capacity(c);
     C::iterator i = c.end();
-    TEST_LIBCPP_ASSERT_FAILURE(*i, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable iterator");
   }
 
   {
     typedef int T;
     typedef std::vector<T, min_allocator<T> > C;
     C c(1);
-    fill_to_capacity(c);
     C::iterator i = c.end();
-    TEST_LIBCPP_ASSERT_FAILURE(*i, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable iterator");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.increment.pass.cpp
similarity index 63%
rename from libcxx/test/libcxx/containers/sequences/vector/assert.iterator.increment.pass.cpp
rename to libcxx/test/libcxx/containers/sequences/vector/debug.iterator.increment.pass.cpp
index e540f40f8c476..d3e4b4ec3143f 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/assert.iterator.increment.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.increment.pass.cpp
@@ -10,14 +10,13 @@
 
 // Increment iterator past end.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <vector>
 #include <cassert>
 
 #include "check_assertion.h"
-#include "fill_to_capacity.h"
 #include "min_allocator.h"
 
 int main(int, char**) {
@@ -25,22 +24,20 @@ int main(int, char**) {
     typedef int T;
     typedef std::vector<T> C;
     C c(1);
-    fill_to_capacity(c);
     C::iterator i = c.begin();
-    i += c.size();
+    ++i;
     assert(i == c.end());
-    TEST_LIBCPP_ASSERT_FAILURE(++i, "__bounded_iter::operator++: Attempt to advance an iterator past the end");
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable iterator");
   }
 
   {
     typedef int T;
     typedef std::vector<T, min_allocator<T> > C;
     C c(1);
-    fill_to_capacity(c);
     C::iterator i = c.begin();
-    i += c.size();
+    ++i;
     assert(i == c.end());
-    TEST_LIBCPP_ASSERT_FAILURE(++i, "__bounded_iter::operator++: Attempt to advance an iterator past the end");
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable iterator");
   }
 
   return 0;
diff --git a/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.index.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.index.pass.cpp
new file mode 100644
index 0000000000000..8e8f6a5dae69d
--- /dev/null
+++ b/libcxx/test/libcxx/containers/sequences/vector/debug.iterator.index.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// Index iterator out of bounds.
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
+
+#include <vector>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::vector<T> C;
+    C c(1);
+    C::iterator i = c.begin();
+    assert(i[0] == 0);
+    TEST_LIBCPP_ASSERT_FAILURE(i[1], "Attempted to subscript an iterator outside its valid range");
+  }
+
+  {
+    typedef int T;
+    typedef std::vector<T, min_allocator<T> > C;
+    C c(1);
+    C::iterator i = c.begin();
+    assert(i[0] == 0);
+    TEST_LIBCPP_ASSERT_FAILURE(i[1], "Attempted to subscript an iterator outside its valid range");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/sequences/vector/fill_to_capacity.h b/libcxx/test/libcxx/containers/sequences/vector/fill_to_capacity.h
deleted file mode 100644
index abf88c477fece..0000000000000
--- a/libcxx/test/libcxx/containers/sequences/vector/fill_to_capacity.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LIBCXX_TEST_LIBCXX_CONTAINERS_SEQUENCES_VECTOR_FILL_TO_CAPACITY_H
-#define LIBCXX_TEST_LIBCXX_CONTAINERS_SEQUENCES_VECTOR_FILL_TO_CAPACITY_H
-
-#include <vector>
-
-template <typename T, typename A>
-void fill_to_capacity(std::vector<T, A>& vec) {
-  // Fill the given vector up to its capacity. Our bounded iterators are currently unable to catch an out-of-bounds
-  // access that goes beyond the container's logical storage (above the size) but is still within its physical storage
-  // (below the capacity) due to iterator stability guarantees. Filling a vector makes this distinction go away.
-  while (vec.size() < vec.capacity()) {
-    vec.push_back(T());
-  }
-}
-
-#endif // LIBCXX_TEST_LIBCXX_CONTAINERS_SEQUENCES_VECTOR_FILL_TO_CAPACITY_H
diff --git a/libcxx/test/libcxx/strings/basic.string/alignof.compile.pass.cpp b/libcxx/test/libcxx/strings/basic.string/alignof.compile.pass.cpp
index 00943ef8762f8..7b4d54ed410b0 100644
--- a/libcxx/test/libcxx/strings/basic.string/alignof.compile.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/alignof.compile.pass.cpp
@@ -10,7 +10,6 @@
 
 // UNSUPPORTED: c++03
 
-#include <iterator>
 #include <string>
 
 #include "test_macros.h"
@@ -19,14 +18,6 @@
 
 template <class T>
 class small_pointer {
-public:
-  using value_type        = T;
-  using difference_type   = std::int16_t;
-  using pointer           = small_pointer;
-  using reference         = T&;
-  using iterator_category = std::random_access_iterator_tag;
-
-private:
   std::uint16_t offset;
 };
 
diff --git a/libcxx/test/libcxx/strings/basic.string/sizeof.compile.pass.cpp b/libcxx/test/libcxx/strings/basic.string/sizeof.compile.pass.cpp
index b85895ffcd837..6e00e43618b2e 100644
--- a/libcxx/test/libcxx/strings/basic.string/sizeof.compile.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/sizeof.compile.pass.cpp
@@ -8,7 +8,6 @@
 
 // Ensure that we never change the size or alignment of `basic_string`
 
-#include <iterator>
 #include <string>
 
 #include "test_macros.h"
@@ -17,14 +16,6 @@
 
 template <class T>
 class small_pointer {
-public:
-  using value_type        = T;
-  using difference_type   = std::int16_t;
-  using pointer           = small_pointer;
-  using reference         = T&;
-  using iterator_category = std::random_access_iterator_tag;
-
-private:
   std::uint16_t offset;
 };
 
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.add.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.add.pass.cpp
similarity index 77%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.add.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.add.pass.cpp
index 56c9d63d0dbaf..8459284637dc5 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.add.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.add.pass.cpp
@@ -10,8 +10,8 @@
 
 // Add to iterator out of bounds.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <string>
 #include <cassert>
@@ -26,7 +26,7 @@ void test() {
   i += 1;
   assert(i == c.end());
   i = c.begin();
-  TEST_LIBCPP_ASSERT_FAILURE(i += 2, "__bounded_iter::operator+=: Attempt to advance an iterator past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(i += 2, "Attempted to add/subtract an iterator outside its valid range");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.decrement.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.decrement.pass.cpp
similarity index 76%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.decrement.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.decrement.pass.cpp
index 43a9739bf936f..f1fa08d006a1e 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.decrement.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.decrement.pass.cpp
@@ -10,8 +10,8 @@
 
 // Decrement iterator prior to begin.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <string>
 #include <cassert>
@@ -25,7 +25,7 @@ void test() {
   typename C::iterator i = c.end();
   --i;
   assert(i == c.begin());
-  TEST_LIBCPP_ASSERT_FAILURE(--i, "__bounded_iter::operator--: Attempt to rewind an iterator past the start");
+  TEST_LIBCPP_ASSERT_FAILURE(--i, "Attempted to decrement a non-decrementable iterator");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.dereference.pass.cpp
similarity index 75%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.dereference.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.dereference.pass.cpp
index e2326be021033..0bf295c6c4f4f 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.dereference.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.dereference.pass.cpp
@@ -10,8 +10,8 @@
 
 // Dereference non-dereferenceable iterator.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <string>
 
@@ -22,7 +22,7 @@ template <class C>
 void test() {
   C c(1, '\0');
   typename C::iterator i = c.end();
-  TEST_LIBCPP_ASSERT_FAILURE(*i, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
+  TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable iterator");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.increment.pass.cpp
similarity index 76%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.increment.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.increment.pass.cpp
index a7453f3115197..9cc9ab40bcdd6 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.increment.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.increment.pass.cpp
@@ -10,8 +10,8 @@
 
 // Increment iterator past end.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <string>
 #include <cassert>
@@ -25,7 +25,7 @@ void test() {
   typename C::iterator i = c.begin();
   ++i;
   assert(i == c.end());
-  TEST_LIBCPP_ASSERT_FAILURE(++i, "__bounded_iter::operator++: Attempt to advance an iterator past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable iterator");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.index.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.index.pass.cpp
similarity index 66%
rename from libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.index.pass.cpp
rename to libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.index.pass.cpp
index e7d384413b589..34060065d2046 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.iterators/assert.iterator.index.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.iterators/debug.iterator.index.pass.cpp
@@ -10,8 +10,8 @@
 
 // Index iterator out of bounds.
 
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-string
-// UNSUPPORTED: libcpp-hardening-mode=none, c++03
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
 
 #include <string>
 #include <cassert>
@@ -23,10 +23,9 @@ template <class C>
 void test() {
   using T = decltype(std::uint8_t() - std::uint8_t());
   C c(1, '\0');
   typename C::iterator i = c.begin();
   assert(i[0] == 0);
-  TEST_LIBCPP_ASSERT_FAILURE(i[1], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
-  TEST_LIBCPP_ASSERT_FAILURE(i[-1], "__bounded_iter::operator[]: Attempt to index an iterator past the start");
+  TEST_LIBCPP_ASSERT_FAILURE(i[1], "Attempted to subscript an iterator outside its valid range");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/assert.push_back.invalidation.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/assert.push_back.invalidation.pass.cpp
deleted file mode 100644
index 193c00891da7f..0000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/assert.push_back.invalidation.pass.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// void push_back(const value_type& x);
-//
-// If no reallocation happens, then references, pointers, and iterators before
-// the insertion point remain valid but those at or after the insertion point,
-// including the past-the-end iterator, are invalidated.
-
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators-in-vector
-// UNSUPPORTED: c++03
-// UNSUPPORTED: libcpp-hardening-mode=none
-// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
-
-#include <vector>
-#include <cassert>
-#include <cstddef>
-
-#include "check_assertion.h"
-
-int main(int, char**) {
-  std::vector<int> vec;
-  vec.reserve(4);
-  std::size_t old_capacity = vec.capacity();
-  assert(old_capacity >= 4);
-
-  vec.push_back(0);
-  vec.push_back(1);
-  vec.push_back(2);
-  auto it = vec.begin();
-  vec.push_back(3);
-  assert(vec.capacity() == old_capacity);
-
-  // The capacity did not change, so the iterator remains valid and can reach the new element.
-  assert(*it == 0);
-  assert(*(it + 1) == 1);
-  assert(*(it + 2) == 2);
-  assert(*(it + 3) == 3);
-
-  while (vec.capacity() == old_capacity) {
-    vec.push_back(42);
-  }
-  TEST_LIBCPP_ASSERT_FAILURE(
-      *(it + old_capacity), "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
-  // Unfortunately, the bounded iterator does not detect that it's been invalidated and will still allow attempts to
-  // dereference elements 0 to 3 (even though they refer to memory that's been reallocated).
-
-  return 0;
-}
diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
index 850352b3bc1ec..f368b1069c063 100644
--- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp
@@ -64,7 +64,7 @@ int main(int, char**)
         std::locale l(LOCALE_fr_FR_UTF_8);
         {
 #if defined(_CS_GNU_LIBC_VERSION) || defined(_WIN32) || defined(_AIX)
-            const char sep = ' ';
+          const char sep = ' ';
 #else
             const char sep = ',';
 #endif
@@ -77,11 +77,11 @@ int main(int, char**)
 #if defined(_CS_GNU_LIBC_VERSION)
             const wchar_t wsep = glibc_version_less_than("2.27") ? L' ' : L'\u202f';
 #  elif defined(_AIX)
-            const wchar_t wsep = L'\u202F';
+          const wchar_t wsep = L'\u202F';
 #  elif defined(_WIN32)
-            const wchar_t wsep = L'\u00A0';
+          const wchar_t wsep = L'\u00A0';
 #  else
-            const wchar_t wsep = L',';
+          const wchar_t wsep = L',';
 #  endif
             typedef wchar_t C;
             const std::numpunct<C>& np = std::use_facet<std::numpunct<C> >(l);
diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp
index 5d8bf51c69696..06eb91dba22ff 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp
@@ -37,7 +37,7 @@ int main(int, char**) {
     using View = std::ranges::transform_view<MoveOnlyView, PlusOneNoexcept>;
     View transformView(MoveOnlyView{buff}, PlusOneNoexcept{});
     assert(*transformView.begin() == 1);
-    ASSERT_NOEXCEPT(*std::declval<std::ranges::iterator_t<View>>());
+    LIBCPP_ASSERT_NOEXCEPT(*std::declval<std::ranges::iterator_t<View>>());
     ASSERT_SAME_TYPE(int, decltype(*std::declval<View>().begin()));
   }
   {
diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp
new file mode 100644
index 0000000000000..ea1e4b61412c4
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// friend constexpr decltype(auto) iter_move(const iterator& i)
+//    noexcept(noexcept(invoke(i.parent_->fun_, *i.current_)))
+
+#include <ranges>
+
+#include "test_macros.h"
+#include "../types.h"
+
+constexpr bool test() {
+  int buff[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  {
+    std::ranges::transform_view transformView(MoveOnlyView{buff}, PlusOneMutable{});
+    auto iter = transformView.begin();
+    ASSERT_NOT_NOEXCEPT(std::ranges::iter_move(iter));
+
+    assert(std::ranges::iter_move(iter) == 1);
+    assert(std::ranges::iter_move(iter + 2) == 3);
+
+    ASSERT_SAME_TYPE(int, decltype(std::ranges::iter_move(iter)));
+    ASSERT_SAME_TYPE(int, decltype(std::ranges::iter_move(std::move(iter))));
+  }
+
+  {
+    LIBCPP_ASSERT_NOEXCEPT(std::ranges::iter_move(
+      std::declval<std::ranges::iterator_t<std::ranges::transform_view<MoveOnlyView, PlusOneNoexcept>>&>()));
+    ASSERT_NOT_NOEXCEPT(std::ranges::iter_move(
+      std::declval<std::ranges::iterator_t<std::ranges::transform_view<MoveOnlyView, PlusOneMutable>>&>()));
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
index abd284852a189..edc8b67808b85 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
@@ -15,7 +15,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <memory>
 #include <string>
 
 #include "make_string.h"
@@ -30,7 +29,7 @@ constexpr void test_appending(std::size_t k, size_t N, size_t new_capacity) {
   s.resize_and_overwrite(new_capacity, [&](auto* p, auto n) {
     assert(n == new_capacity);
     LIBCPP_ASSERT(s.size() == new_capacity);
-    LIBCPP_ASSERT(std::to_address(s.begin()) == p);
+    LIBCPP_ASSERT(s.begin().base() == p);
     assert(std::all_of(p, p + k, [](const auto ch) { return ch == 'a'; }));
     std::fill(p + k, p + n, 'b');
     p[n] = 'c'; // will be overwritten
@@ -49,7 +48,7 @@ constexpr void test_truncating(std::size_t o, size_t N) {
   s.resize_and_overwrite(N, [&](auto* p, auto n) {
     assert(n == N);
     LIBCPP_ASSERT(s.size() == n);
-    LIBCPP_ASSERT(std::to_address(s.begin()) == p);
+    LIBCPP_ASSERT(s.begin().base() == p);
     assert(std::all_of(p, p + n, [](auto ch) { return ch == 'a'; }));
     p[n - 1] = 'b';
     p[n]     = 'c'; // will be overwritten
diff --git a/libcxx/utils/ci/vendor/android/Dockerfile.emulator b/libcxx/utils/ci/vendor/android/Dockerfile.emulator
index 2f52b27f6edcd..54953f40014e7 100644
--- a/libcxx/utils/ci/vendor/android/Dockerfile.emulator
+++ b/libcxx/utils/ci/vendor/android/Dockerfile.emulator
@@ -16,7 +16,7 @@ RUN apt-get update && apt-get install -y \
     unzip \
     && rm -rf /var/lib/apt/lists/*
 
-ENV ANDROID_HOME=/opt/android/sdk
+ENV ANDROID_HOME /opt/android/sdk
 
 RUN curl -sL https://dl.google.com/android/repository/commandlinetools-linux-9477386_latest.zip -o cmdline-tools.zip && \
     mkdir -p ${ANDROID_HOME} && \
diff --git a/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh b/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh
index 99d4995b2ee1d..e4538697266a4 100755
--- a/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh
+++ b/libcxx/utils/ci/vendor/android/emulator-entrypoint.sh
@@ -45,5 +45,5 @@ fi
 
 # Use exec so that the emulator is PID 1, so that `docker stop` kills the
 # emulator.
-exec emulator @emulator -no-audio -no-window -no-metrics \
+exec emulator @emulator -no-audio -no-window \
     -partition-size "${EMU_PARTITION_SIZE}"
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index e978875d543f3..5e708da4f8fbe 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -352,8 +352,6 @@ def _mingwSupportsModules(cfg):
     "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime",
     "_LIBCPP_ABI_VERSION": "libcpp-abi-version",
     "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators",
-    "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string",
-    "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector",
     "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor",
     "_LIBCPP_HAS_NO_FILESYSTEM": "no-filesystem",
     "_LIBCPP_HAS_NO_RANDOM_DEVICE": "no-random-device",
diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h
index 56b137d56873a..5ef46f5af6a6c 100644
--- a/lld/COFF/Symbols.h
+++ b/lld/COFF/Symbols.h
@@ -98,10 +98,10 @@ class Symbol {
   friend SymbolTable;
   explicit Symbol(Kind k, StringRef n = "")
       : symbolKind(k), isExternal(true), isCOMDAT(false),
-        writtenToSymtab(false), isUsedInRegularObj(false),
-        pendingArchiveLoad(false), isGCRoot(false), isRuntimePseudoReloc(false),
-        deferUndefined(false), canInline(true), isWeak(false),
-        nameSize(n.size()), nameData(n.empty() ? nullptr : n.data()) {
+        writtenToSymtab(false), pendingArchiveLoad(false), isGCRoot(false),
+        isRuntimePseudoReloc(false), deferUndefined(false), canInline(true),
+        isWeak(false), nameSize(n.size()),
+        nameData(n.empty() ? nullptr : n.data()) {
     assert((!n.empty() || k <= LastDefinedCOFFKind) &&
            "If the name is empty, the Symbol must be a DefinedCOFF.");
   }
@@ -499,10 +499,8 @@ void replaceSymbol(Symbol *s, ArgT &&... arg) {
   assert(static_cast<Symbol *>(static_cast<T *>(nullptr)) == nullptr &&
          "Not a Symbol");
   bool canInline = s->canInline;
-  bool isUsedInRegularObj = s->isUsedInRegularObj;
   new (s) T(std::forward<ArgT>(arg)...);
   s->canInline = canInline;
-  s->isUsedInRegularObj = isUsedInRegularObj;
 }
 } // namespace coff
 
diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp
index 56759c28dcf41..6af89ce3517b7 100644
--- a/lld/ELF/Arch/RISCV.cpp
+++ b/lld/ELF/Arch/RISCV.cpp
@@ -1131,7 +1131,6 @@ static void mergeAtomic(DenseMap<unsigned, unsigned>::iterator it,
     case RISCVAttrs::RISCVAtomicAbiTag::A6C:
       return;
     };
-    break;
 
   case RISCVAtomicAbiTag::A6S:
     switch (newTag) {
@@ -1145,7 +1144,6 @@ static void mergeAtomic(DenseMap<unsigned, unsigned>::iterator it,
     case RISCVAttrs::RISCVAtomicAbiTag::A6S:
       return;
     };
-    break;
 
   case RISCVAtomicAbiTag::A7:
     switch (newTag) {
@@ -1159,7 +1157,6 @@ static void mergeAtomic(DenseMap<unsigned, unsigned>::iterator it,
     case RISCVAttrs::RISCVAtomicAbiTag::A7:
       return;
     };
-    break;
   };
 
   // If we get here, then we have an invalid tag, so report it.
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 1883acc781ece..dc9d635b48ec4 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -49,7 +49,6 @@
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/TargetParser/Host.h"
-#include "llvm/TextAPI/Architecture.h"
 #include "llvm/TextAPI/PackedVersion.h"
 
 #include <algorithm>
@@ -1081,55 +1080,42 @@ static bool shouldEmitChainedFixups(const InputArgList &args) {
   if (arg && arg->getOption().matches(OPT_no_fixup_chains))
     return false;
 
-  bool requested = arg && arg->getOption().matches(OPT_fixup_chains);
-  if (!config->isPic) {
-    if (requested)
-      error("-fixup_chains is incompatible with -no_pie");
+  bool isRequested = arg != nullptr;
 
-    return false;
+  // Version numbers taken from the Xcode 13.3 release notes.
+  static const std::array<std::pair<PlatformType, VersionTuple>, 5> minVersion =
+      {{{PLATFORM_MACOS, VersionTuple(11, 0)},
+        {PLATFORM_IOS, VersionTuple(13, 4)},
+        {PLATFORM_TVOS, VersionTuple(14, 0)},
+        {PLATFORM_WATCHOS, VersionTuple(7, 0)},
+        {PLATFORM_XROS, VersionTuple(1, 0)}}};
+  PlatformType platform = removeSimulator(config->platformInfo.target.Platform);
+  auto it = llvm::find_if(minVersion,
+                          [&](const auto &p) { return p.first == platform; });
+  if (it != minVersion.end() &&
+      it->second > config->platformInfo.target.MinDeployment) {
+    if (!isRequested)
+      return false;
+
+    warn("-fixup_chains requires " + getPlatformName(config->platform()) + " " +
+         it->second.getAsString() + ", which is newer than target minimum of " +
+         config->platformInfo.target.MinDeployment.getAsString());
   }
 
   if (!is_contained({AK_x86_64, AK_x86_64h, AK_arm64}, config->arch())) {
-    if (requested)
+    if (isRequested)
       error("-fixup_chains is only supported on x86_64 and arm64 targets");
-
     return false;
   }
 
-  if (args.hasArg(OPT_preload)) {
-    if (requested)
-      error("-fixup_chains is incompatible with -preload");
-
+  if (!config->isPic) {
+    if (isRequested)
+      error("-fixup_chains is incompatible with -no_pie");
     return false;
   }
 
-  if (requested)
-    return true;
-
-  static const std::array<std::pair<PlatformType, VersionTuple>, 9> minVersion =
-      {{
-          {PLATFORM_IOS, VersionTuple(13, 4)},
-          {PLATFORM_IOSSIMULATOR, VersionTuple(16, 0)},
-          {PLATFORM_MACOS, VersionTuple(13, 0)},
-          {PLATFORM_TVOS, VersionTuple(14, 0)},
-          {PLATFORM_TVOSSIMULATOR, VersionTuple(15, 0)},
-          {PLATFORM_WATCHOS, VersionTuple(7, 0)},
-          {PLATFORM_WATCHOSSIMULATOR, VersionTuple(8, 0)},
-          {PLATFORM_XROS, VersionTuple(1, 0)},
-          {PLATFORM_XROS_SIMULATOR, VersionTuple(1, 0)},
-      }};
-  PlatformType platform = config->platformInfo.target.Platform;
-  auto it = llvm::find_if(minVersion,
-                          [&](const auto &p) { return p.first == platform; });
-
-  // We don't know the versions for other platforms, so default to disabled.
-  if (it == minVersion.end())
-    return false;
-
-  if (it->second > config->platformInfo.target.MinDeployment)
-    return false;
-
-  return true;
+  // TODO: Enable by default once stable.
+  return isRequested;
 }
 
 static bool shouldEmitRelativeMethodLists(const InputArgList &args) {
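
The restructured shouldEmitChainedFixups gates the feature on a
platform-to-minimum-deployment table and, per the TODO, only enables it when
explicitly requested. The table lookup reduces to the following sketch (the
enum, version type, and function name are stand-ins, not lld's actual types):

  #include <algorithm>
  #include <array>
  #include <cstdio>
  #include <utility>

  // Stand-ins for lld's PlatformType/VersionTuple, for illustration only.
  enum class Platform { MacOS, IOS, TvOS, WatchOS, XrOS, Other };
  struct Version {
    int major, minor;
    bool operator>(const Version &o) const {
      return major != o.major ? major > o.major : minor > o.minor;
    }
  };

  bool deploymentSupportsChainedFixups(Platform platform, Version minDeployment) {
    // Minimum OS versions for chained fixups; the patch cites the Xcode 13.3
    // release notes as the source of these numbers.
    static const std::array<std::pair<Platform, Version>, 5> minVersion = {{
        {Platform::MacOS, {11, 0}},
        {Platform::IOS, {13, 4}},
        {Platform::TvOS, {14, 0}},
        {Platform::WatchOS, {7, 0}},
        {Platform::XrOS, {1, 0}},
    }};
    auto it = std::find_if(minVersion.begin(), minVersion.end(),
                           [&](const auto &p) { return p.first == platform; });
    // An unknown platform or a deployment target older than the minimum fails.
    return it != minVersion.end() && !(it->second > minDeployment);
  }

  int main() {
    std::printf("%d\n", deploymentSupportsChainedFixups(Platform::MacOS, {12, 0})); // 1
    std::printf("%d\n", deploymentSupportsChainedFixups(Platform::IOS, {12, 0}));   // 0
    return 0;
  }
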
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
index 0eb809282af28..e6b80c1d42d9e 100644
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -28,8 +28,8 @@
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/TimeProfiler.h"
-#include "llvm/Support/thread.h"
 #include "llvm/Support/xxhash.h"
 
 #include <algorithm>
@@ -66,6 +66,7 @@ class Writer {
 
   template <class LP> void run();
 
+  DefaultThreadPool threadPool;
   std::unique_ptr<FileOutputBuffer> &buffer;
   uint64_t addr = 0;
   uint64_t fileOff = 0;
@@ -1120,12 +1121,14 @@ void Writer::finalizeLinkEditSegment() {
       symtabSection,     indirectSymtabSection,
       dataInCodeSection, functionStartsSection,
   };
-
-  parallelForEach(linkEditSections.begin(), linkEditSections.end(),
-                  [](LinkEditSection *osec) {
-                    if (osec)
-                      osec->finalizeContents();
-                  });
+  SmallVector<std::shared_future<void>> threadFutures;
+  threadFutures.reserve(linkEditSections.size());
+  for (LinkEditSection *osec : linkEditSections)
+    if (osec)
+      threadFutures.emplace_back(threadPool.async(
+          [](LinkEditSection *osec) { osec->finalizeContents(); }, osec));
+  for (std::shared_future<void> &future : threadFutures)
+    future.wait();
 
   // Now that __LINKEDIT is filled out, do a proper calculation of its
   // addresses and offsets.
@@ -1167,8 +1170,6 @@ void Writer::openFile() {
 }
 
 void Writer::writeSections() {
-  TimeTraceScope timeScope("Write output sections");
-
   uint8_t *buf = buffer->getBufferStart();
   std::vector<const OutputSection *> osecs;
   for (const OutputSegment *seg : outputSegments)
@@ -1199,15 +1200,18 @@ void Writer::writeUuid() {
 
   ArrayRef<uint8_t> data{buffer->getBufferStart(), buffer->getBufferEnd()};
   std::vector<ArrayRef<uint8_t>> chunks = split(data, 1024 * 1024);
-
   // Leave one slot for filename
   std::vector<uint64_t> hashes(chunks.size() + 1);
-  parallelFor(0, chunks.size(),
-              [&](size_t i) { hashes[i] = xxh3_64bits(chunks[i]); });
+  SmallVector<std::shared_future<void>> threadFutures;
+  threadFutures.reserve(chunks.size());
+  for (size_t i = 0; i < chunks.size(); ++i)
+    threadFutures.emplace_back(threadPool.async(
+        [&](size_t j) { hashes[j] = xxh3_64bits(chunks[j]); }, i));
+  for (std::shared_future<void> &future : threadFutures)
+    future.wait();
   // Append the output filename so that identical binaries with different names
   // don't get the same UUID.
   hashes[chunks.size()] = xxh3_64bits(sys::path::filename(config->finalOutput));
-
   uint64_t digest = xxh3_64bits({reinterpret_cast<uint8_t *>(hashes.data()),
                                  hashes.size() * sizeof(uint64_t)});
   uuidCommand->writeUuid(digest);
@@ -1326,18 +1330,15 @@ template <class LP> void Writer::run() {
   sortSegmentsAndSections();
   createLoadCommands<LP>();
   finalizeAddresses();
-
-  llvm::thread mapFileWriter([&] {
+  threadPool.async([&] {
     if (LLVM_ENABLE_THREADS && config->timeTraceEnabled)
       timeTraceProfilerInitialize(config->timeTraceGranularity, "writeMapFile");
     writeMapFile();
     if (LLVM_ENABLE_THREADS && config->timeTraceEnabled)
       timeTraceProfilerFinishThread();
   });
-
   finalizeLinkEditSegment();
   writeOutputFile();
-  mapFileWriter.join();
 }
 
 template <class LP> void macho::writeResult() { Writer().run<LP>(); }
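
The Writer change above replaces parallelForEach/parallelFor with an owned
thread pool and explicit futures, which also lets the map-file write overlap
with __LINKEDIT finalization. The fan-out-and-wait pattern, sketched with the
standard library rather than LLVM's ThreadPool (an analogue, not the lld code):

  #include <cstdio>
  #include <future>
  #include <vector>

  // Launch one task per non-null item, then block until all complete.
  // std::async stands in for the thread pool's async(); the wait loop mirrors
  // the patch's loop over shared_futures.
  template <class T, class Fn>
  void forEachAsync(std::vector<T *> &items, Fn fn) {
    std::vector<std::future<void>> futures;
    futures.reserve(items.size());
    for (T *item : items)
      if (item) // absent sections are skipped, as in finalizeLinkEditSegment
        futures.push_back(std::async(std::launch::async, fn, item));
    for (auto &f : futures)
      f.wait();
  }

  int main() {
    int a = 1, b = 2;
    std::vector<int *> items{&a, nullptr, &b};
    forEachAsync(items, [](int *p) { *p *= 10; });
    std::printf("%d %d\n", a, b); // prints "10 20"
    return 0;
  }
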
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 6f60efd87c975..ba8543732bb8e 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -26,6 +26,70 @@ Non-comprehensive list of changes in this release
 ELF Improvements
 ----------------
 
+* ``EI_OSABI`` in the output is now inferred from input object files.
+  (`#97144 <https://github.com/llvm/llvm-project/pull/97144>`_)
+* ``--compress-sections <section-glob>={none,zlib,zstd}[:level]`` is added to compress
+  matched output sections without the ``SHF_ALLOC`` flag.
+  (`#84855 <https://github.com/llvm/llvm-project/pull/84855>`_)
+  (`#90567 <https://github.com/llvm/llvm-project/pull/90567>`_)
+* The default compression level for zlib is now independent of the linker
+  optimization level; it is always ``Z_BEST_SPEED``.
+* zstd compression parallelism no longer requires a ``ZSTD_MULTITHREAD`` build.
+* ``GNU_PROPERTY_AARCH64_FEATURE_PAUTH`` notes, ``R_AARCH64_AUTH_ABS64`` and
+  ``R_AARCH64_AUTH_RELATIVE`` relocations are now supported.
+  (`#72714 <https://github.com/llvm/llvm-project/pull/72714>`_)
+* ``--no-allow-shlib-undefined`` now rejects non-exported definitions in the
+  ``def-hidden.so ref.so`` case.
+  (`#86777 <https://github.com/llvm/llvm-project/issues/86777>`_)
+* ``--debug-names`` is added to create a merged ``.debug_names`` index
+  from input ``.debug_names`` sections. Type units are not handled yet.
+  (`#86508 <https://github.com/llvm/llvm-project/pull/86508>`_)
+* The ``--enable-non-contiguous-regions`` option allows automatically packing
+  input sections into memory regions, spilling to later matches if a
+  region would overflow. This reduces the toil of manually packing regions
+  (typical for embedded). It also makes full LTO feasible in such cases, since
+  IR merging currently prevents the linker script from referring to input
+  files. (`#90007 <https://github.com/llvm/llvm-project/pull/90007>`_)
+* ``--default-script``/``-dT`` is implemented to specify a default script that is processed
+  if ``--script``/``-T`` is not specified.
+  (`#89327 <https://github.com/llvm/llvm-project/pull/89327>`_)
+* ``--force-group-allocation`` is implemented to discard ``SHT_GROUP`` sections
+  and combine relocation sections if their relocated section group members are
+  placed in the same output section.
+  (`#94704 <https://github.com/llvm/llvm-project/pull/94704>`_)
+* ``--build-id`` now defaults to generating a 20-byte digest ("sha1") instead
+  of 8-byte ("fast"). This improves compatibility with RPM packaging tools.
+  (`#93943 <https://github.com/llvm/llvm-project/pull/93943>`_)
+* ``-z lrodata-after-bss`` is implemented to place ``.lrodata`` after ``.bss``.
+  (`#81224 <https://github.com/llvm/llvm-project/pull/81224>`_)
+* ``--export-dynamic`` no longer creates dynamic sections for ``-no-pie`` static linking.
+* ``--lto-emit-asm`` is now added as the canonical spelling of ``--plugin-opt=emit-asm``.
+* ``--lto-emit-llvm`` now uses the pre-codegen module.
+  (`#97480 <https://github.com/llvm/llvm-project/pull/97480>`_)
+* When AArch64 PAuth is enabled, ``-z pack-relative-relocs`` now encodes ``R_AARCH64_AUTH_RELATIVE`` relocations in ``.relr.auth.dyn``.
+  (`#96496 <https://github.com/llvm/llvm-project/pull/96496>`_)
+* ``-z gcs`` and ``-z gcs-report`` are now supported for the AArch64 Guarded Control Stack extension.
+* ``-r`` now forces ``-Bstatic``.
+* Thumb2 PLT is now supported for Cortex-M processors.
+  (`#93644 <https://github.com/llvm/llvm-project/pull/93644>`_)
+* ``DW_EH_sdata4`` of addresses larger than 0x80000000 is now supported for MIPS32.
+  (`#92438 <https://github.com/llvm/llvm-project/pull/92438>`_)
+* Certain unknown section types are rejected.
+  (`#85173 <https://github.com/llvm/llvm-project/pull/85173>`_)
+* In ``PROVIDE(lhs = rhs) PROVIDE(rhs = ...)``, ``lhs`` is now defined only if ``rhs`` is needed.
+  (`#74771 <https://github.com/llvm/llvm-project/issues/74771>`_)
+  (`#87530 <https://github.com/llvm/llvm-project/pull/87530>`_)
+* ``OUTPUT_FORMAT(binary)`` is now supported.
+  (`#98837 <https://github.com/llvm/llvm-project/pull/98837>`_)
+* ``NOCROSSREFS`` and ``NOCROSSREFS_TO`` commands are now supported to prohibit
+  cross references between certain output sections.
+  (`#98773 <https://github.com/llvm/llvm-project/pull/98773>`_)
+* Orphan placement is refined to prefer the last similar section when its rank <= orphan's rank.
+  (`#94099 <https://github.com/llvm/llvm-project/pull/94099>`_)
+  Non-alloc orphan sections are now placed at the end.
+  (`#94519 <https://github.com/llvm/llvm-project/pull/94519>`_)
+* ``R_X86_64_REX_GOTPCRELX`` of the addq form is no longer incorrectly optimized when the address is larger than 0x80000000.
+
 Breaking changes
 ----------------
 
diff --git a/lld/test/ELF/aarch64-cortex-a53-843419-abs-mapsyms.s b/lld/test/ELF/aarch64-cortex-a53-843419-abs-mapsyms.s
index a9b95b1a63ef6..608b1aceb0a00 100644
--- a/lld/test/ELF/aarch64-cortex-a53-843419-abs-mapsyms.s
+++ b/lld/test/ELF/aarch64-cortex-a53-843419-abs-mapsyms.s
@@ -12,7 +12,7 @@
         .type _start, %function
 _start: ret
 
-// CHECK:     Name: $x
+// CHECK:     Name: $x.0
 // CHECK-NEXT:     Value: 0x0
 // CHECK-NEXT:     Size: 0
 // CHECK-NEXT:     Binding: Local (0x0)
diff --git a/lld/test/ELF/aarch64-gnu-ifunc.s b/lld/test/ELF/aarch64-gnu-ifunc.s
index f39ae0cd84934..d76b54eabf8a4 100644
--- a/lld/test/ELF/aarch64-gnu-ifunc.s
+++ b/lld/test/ELF/aarch64-gnu-ifunc.s
@@ -37,7 +37,7 @@
 // CHECK-NEXT:    Section: Undefined
 // CHECK-NEXT:  }
 // CHECK-NEXT:  Symbol {
-// CHECK-NEXT:    Name: $x
+// CHECK-NEXT:    Name: $x.0
 // CHECK-NEXT:    Value: 0x210188
 // CHECK-NEXT:    Size: 0
 // CHECK-NEXT:    Binding: Local
diff --git a/lld/test/ELF/aarch64-reloc-pauth.s b/lld/test/ELF/aarch64-reloc-pauth.s
index 6b4d23d37babc..19493fcfa1365 100644
--- a/lld/test/ELF/aarch64-reloc-pauth.s
+++ b/lld/test/ELF/aarch64-reloc-pauth.s
@@ -68,11 +68,11 @@
 # RELR-EMPTY:
 # RELR-NEXT: Relocation section '.relr.auth.dyn' at offset 0x[[ADDR2]] contains 5 entries:
 # RELR-NEXT: Index: Entry Address Symbolic Address
-# RELR-NEXT: 0000: 0000000000030440 0000000000030440 $d
-# RELR-NEXT: 0001: 000000000000000f 0000000000030448 $d + 0x8
-# RELR-NEXT:  0000000000030450 $d + 0x10
-# RELR-NEXT:  0000000000030458 $d + 0x18
-# RELR-NEXT: 0002: 0000000000030492 0000000000030492 $d + 0x52
+# RELR-NEXT: 0000: 0000000000030440 0000000000030440 $d.0
+# RELR-NEXT: 0001: 000000000000000f 0000000000030448 $d.0 + 0x8
+# RELR-NEXT:  0000000000030450 $d.0 + 0x10
+# RELR-NEXT:  0000000000030458 $d.0 + 0x18
+# RELR-NEXT: 0002: 0000000000030492 0000000000030492 $d.0 + 0x52
 
 # HEX:      Hex dump of section '.test':
 # HEX-NEXT: 0x00030440 01000000 2a000020 42040300 2b000000
@@ -201,7 +201,7 @@
 # EMPTY-RELA-EMPTY:
 # EMPTY-RELA-NEXT: Relocation section '.relr.auth.dyn' at offset {{.+}} contains 1 entries:
 # EMPTY-RELA-NEXT: Index: Entry Address Symbolic Address
-# EMPTY-RELA-NEXT: 0000: 0000000000030310 0000000000030310 $d
+# EMPTY-RELA-NEXT: 0000: 0000000000030310 0000000000030310 $d.0
 
 # EMPTY-RELA-RO-NOT: .rela.dyn
 
diff --git a/lld/test/ELF/aarch64-thunk-script.s b/lld/test/ELF/aarch64-thunk-script.s
index a6c524f548288..08ff4e9871180 100644
--- a/lld/test/ELF/aarch64-thunk-script.s
+++ b/lld/test/ELF/aarch64-thunk-script.s
@@ -43,8 +43,8 @@ high_target:
 // CHECK-NEXT:                 ret
 
 /// Local symbols copied from %t.o
-// NM:      t $x
-// NM-NEXT: t $x
+// NM:      t $x.0
+// NM-NEXT: t $x.1
 /// Local thunk symbols.
 // NM-NEXT: t __AArch64AbsLongThunk_high_target
 // NM-NEXT: t $x
diff --git a/lld/test/ELF/basic-aarch64.s b/lld/test/ELF/basic-aarch64.s
index 1f59d33c4230b..6b109e8da2c3e 100644
--- a/lld/test/ELF/basic-aarch64.s
+++ b/lld/test/ELF/basic-aarch64.s
@@ -119,7 +119,7 @@ _start:
 # CHECK-NEXT:     ]
 # CHECK-NEXT:     Address: 0x0
 # CHECK-NEXT:     Offset: 0x1AA
-# CHECK-NEXT:     Size: 11
+# CHECK-NEXT:     Size: 13
 # CHECK-NEXT:     Link: 0
 # CHECK-NEXT:     Info: 0
 # CHECK-NEXT:     AddressAlignment: 1
@@ -189,7 +189,7 @@ _start:
 # CHECK-NEXT:     Section: Undefined (0x0)
 # CHECK-NEXT:   }
 # CHECK-NEXT:   Symbol {
-# CHECK-NEXT:     Name: $x
+# CHECK-NEXT:     Name: $x.0
 # CHECK-NEXT:     Value: 0x210120
 # CHECK-NEXT:     Size: 0
 # CHECK-NEXT:     Binding: Local (0x0)
diff --git a/lld/test/ELF/pack-dyn-relocs.s b/lld/test/ELF/pack-dyn-relocs.s
index dd18c92c7352e..3cbba68716972 100644
--- a/lld/test/ELF/pack-dyn-relocs.s
+++ b/lld/test/ELF/pack-dyn-relocs.s
@@ -264,30 +264,30 @@
 // RELR64-EMPTY: 
 // RELR64-NEXT:  Relocation section '.relr.dyn' at offset {{.*}} contains 24 entries:
 // RELR64-NEXT:  Symbolic Address
-// RELR64-NEXT:  $d{{$}}
-// RELR64-NEXT:  $d + 0x8
-// RELR64-NEXT:  $d + 0x10
-// RELR64-NEXT:  $d + 0x18
-// RELR64-NEXT:  $d + 0x20
-// RELR64-NEXT:  $d + 0x28
-// RELR64-NEXT:  $d + 0x30
-// RELR64-NEXT:  $d + 0x38
-// RELR64-NEXT:  $d + 0x48
-// RELR64-NEXT:  $d + 0x50
-// RELR64-NEXT:  $d + 0x58
-// RELR64-NEXT:  $d + 0x60
-// RELR64-NEXT:  $d + 0x68
-// RELR64-NEXT:  $d + 0x70
-// RELR64-NEXT:  $d + 0x78
-// RELR64-NEXT:  $d + 0x90
-// RELR64-NEXT:  $d + 0x98
-// RELR64-NEXT:  $d + 0xa0
-// RELR64-NEXT:  $d + 0xa8
-// RELR64-NEXT:  $d + 0xb0
-// RELR64-NEXT:  $d + 0xb8
-// RELR64-NEXT:  $d + 0xc0
-// RELR64-NEXT:  $d + 0xc8
-// RELR64-NEXT:  $d + 0xd0
+// RELR64-NEXT:  $d.0{{$}}
+// RELR64-NEXT:  $d.0 + 0x8
+// RELR64-NEXT:  $d.0 + 0x10
+// RELR64-NEXT:  $d.0 + 0x18
+// RELR64-NEXT:  $d.0 + 0x20
+// RELR64-NEXT:  $d.0 + 0x28
+// RELR64-NEXT:  $d.0 + 0x30
+// RELR64-NEXT:  $d.0 + 0x38
+// RELR64-NEXT:  $d.0 + 0x48
+// RELR64-NEXT:  $d.0 + 0x50
+// RELR64-NEXT:  $d.0 + 0x58
+// RELR64-NEXT:  $d.0 + 0x60
+// RELR64-NEXT:  $d.0 + 0x68
+// RELR64-NEXT:  $d.0 + 0x70
+// RELR64-NEXT:  $d.0 + 0x78
+// RELR64-NEXT:  $d.0 + 0x90
+// RELR64-NEXT:  $d.0 + 0x98
+// RELR64-NEXT:  $d.0 + 0xa0
+// RELR64-NEXT:  $d.0 + 0xa8
+// RELR64-NEXT:  $d.0 + 0xb0
+// RELR64-NEXT:  $d.0 + 0xb8
+// RELR64-NEXT:  $d.0 + 0xc0
+// RELR64-NEXT:  $d.0 + 0xc8
+// RELR64-NEXT:  $d.0 + 0xd0
 // RELR64-EMPTY:
 // RELR64-NEXT: Hex dump of section '.data':
 // RELR64-NEXT: 0x00030490 90040300 00000000 91040300 00000000 .
diff --git a/lld/test/MachO/arm64-32-stubs.s b/lld/test/MachO/arm64-32-stubs.s
index a5d48ff5baef3..743d5c419bf1a 100644
--- a/lld/test/MachO/arm64-32-stubs.s
+++ b/lld/test/MachO/arm64-32-stubs.s
@@ -10,7 +10,7 @@
 # RUN: llvm-mc -filetype=obj -triple=arm64_32-apple-watchos %t/test.s -o %t/test.o
 # RUN: %lld-watchos -dylib -install_name @executable_path/libfoo.dylib %t/foo.o -o %t/libfoo.dylib
 # RUN: %lld-watchos -dylib -install_name @executable_path/libbar.dylib %t/bar.o -o %t/libbar.dylib
-# RUN: %lld-watchos -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o -o %t/test -no_fixup_chains
+# RUN: %lld-watchos -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o -o %t/test
 
 # RUN: llvm-objdump --no-print-imm-hex --macho -d --no-show-raw-insn --section="__TEXT,__stubs" --section="__TEXT,__stub_helper" %t/test | FileCheck %s
 
diff --git a/lld/test/MachO/arm64-stubs.s b/lld/test/MachO/arm64-stubs.s
index 6fd94661d32e2..f714e20084895 100644
--- a/lld/test/MachO/arm64-stubs.s
+++ b/lld/test/MachO/arm64-stubs.s
@@ -5,7 +5,7 @@
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/test.s -o %t/test.o
 # RUN: %lld -arch arm64 -dylib -install_name @executable_path/libfoo.dylib %t/foo.o -o %t/libfoo.dylib
 # RUN: %lld -arch arm64 -dylib -install_name @executable_path/libbar.dylib %t/bar.o -o %t/libbar.dylib
-# RUN: %lld -arch arm64 -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o -o %t/test -no_fixup_chains
+# RUN: %lld -arch arm64 -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o -o %t/test
 
 # RUN: llvm-objdump --no-print-imm-hex --macho -d --no-show-raw-insn --section="__TEXT,__stubs" --section="__TEXT,__stub_helper" %t/test | FileCheck %s
 
diff --git a/lld/test/MachO/dyld-stub-binder.s b/lld/test/MachO/dyld-stub-binder.s
index 170fe8abd89bd..1d2e556607c79 100644
--- a/lld/test/MachO/dyld-stub-binder.s
+++ b/lld/test/MachO/dyld-stub-binder.s
@@ -14,27 +14,27 @@
 # RUN: %lld -arch arm64 -lSystem -dylib %t/foo.o -o %t/libfoo.dylib
 # RUN: llvm-nm -m %t/libfoo.dylib | FileCheck --check-prefix=STUB %s
 
+
 ## Dylibs that do lazy dynamic calls do need dyld_stub_binder.
 # RUN: not %no-lsystem-lld -arch arm64 -dylib %t/bar.o %t/libfoo.dylib \
-# RUN:     -o %t/libbar.dylib -no_fixup_chains 2>&1 | \
-# RUN:     FileCheck --check-prefix=MISSINGSTUB %s
+# RUN:     -o %t/libbar.dylib 2>&1 | FileCheck --check-prefix=MISSINGSTUB %s
 # RUN: %lld -arch arm64 -lSystem -dylib %t/bar.o  %t/libfoo.dylib \
-# RUN:     -o %t/libbar.dylib -no_fixup_chains
+# RUN:     -o %t/libbar.dylib
 # RUN: llvm-nm -m %t/libbar.dylib | FileCheck --check-prefix=STUB %s
 
 ## As do executables.
 # RUN: not %no-lsystem-lld -arch arm64 %t/libfoo.dylib %t/libbar.dylib %t/test.o \
-# RUN:     -o %t/test -no_fixup_chains 2>&1 | FileCheck --check-prefix=MISSINGSTUB %s
+# RUN:     -o %t/test 2>&1 | FileCheck --check-prefix=MISSINGSTUB %s
 # RUN: %lld -arch arm64 -lSystem %t/libfoo.dylib %t/libbar.dylib %t/test.o \
-# RUN:     -o %t/test -no_fixup_chains
+# RUN:     -o %t/test
 # RUN: llvm-nm -m %t/test | FileCheck --check-prefix=STUB %s
 
 ## Test dynamic lookup of dyld_stub_binder.
 # RUN: %no-lsystem-lld -arch arm64 %t/libfoo.dylib %t/libbar.dylib %t/test.o \
-# RUN:     -o %t/test -undefined dynamic_lookup -no_fixup_chains
+# RUN:     -o %t/test -undefined dynamic_lookup
 # RUN: llvm-nm -m %t/test | FileCheck --check-prefix=DYNSTUB %s
 # RUN: %no-lsystem-lld -arch arm64 %t/libfoo.dylib %t/libbar.dylib %t/test.o \
-# RUN:     -o %t/test -U dyld_stub_binder -no_fixup_chains
+# RUN:     -o %t/test -U dyld_stub_binder
 # RUN: llvm-nm -m %t/test | FileCheck --check-prefix=DYNSTUB %s
 
 # MISSINGSTUB:      error: undefined symbol: dyld_stub_binder
diff --git a/lld/test/MachO/invalid/chained-fixups-incompatible.s b/lld/test/MachO/invalid/chained-fixups-incompatible.s
index 0589204968fe3..83411472beb9f 100644
--- a/lld/test/MachO/invalid/chained-fixups-incompatible.s
+++ b/lld/test/MachO/invalid/chained-fixups-incompatible.s
@@ -2,6 +2,9 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o
 # RUN: not %lld -lSystem -no_pie -fixup_chains %t.o -o /dev/null 2>&1 | \
 # RUN:   FileCheck %s --check-prefix=NO-PIE
+# RUN: %no-fatal-warnings-lld -fixup_chains %t.o -o /dev/null \
+# RUN:   -lSystem -platform_version macos 10.15 10.15 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=VERSION
 # RUN: llvm-mc -filetype=obj -triple=arm64_32-apple-darwin %s -o %t-arm64_32.o
 # RUN: not %lld-watchos -lSystem -fixup_chains %t-arm64_32.o -o /dev/null 2>&1 | \
 # RUN:   FileCheck %s --check-prefix=ARCH
@@ -9,6 +12,7 @@
 ## Check that we emit diagnostics when -fixup_chains is explicitly specified,
 ## but we don't support creating chained fixups for said configuration.
 # NO-PIE:  error: -fixup_chains is incompatible with -no_pie
+# VERSION: warning: -fixup_chains requires macOS 11.0, which is newer than target minimum of 10.15
 # ARCH:    error: -fixup_chains is only supported on x86_64 and arm64 targets
 
 .globl _main
diff --git a/lld/test/MachO/objc-selrefs.s b/lld/test/MachO/objc-selrefs.s
index eebe7c6476cd5..6d144f4938b45 100644
--- a/lld/test/MachO/objc-selrefs.s
+++ b/lld/test/MachO/objc-selrefs.s
@@ -6,14 +6,14 @@
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/implicit-selrefs.s -o %t/implicit-selrefs.o
 
 # RUN: %lld -dylib -arch arm64 -lSystem -o %t/explicit-only-no-icf \
-# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o -no_fixup_chains
+# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o
 # RUN: llvm-otool -vs __DATA __objc_selrefs %t/explicit-only-no-icf | \
 # RUN:   FileCheck %s --check-prefix=EXPLICIT-NO-ICF
 
 ## NOTE: ld64 always dedups the selrefs unconditionally, but we only do it when
 ## ICF is enabled.
 # RUN: %lld -dylib -arch arm64 -lSystem -o %t/explicit-only-with-icf \
-# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o -no_fixup_chains
+# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o
 # RUN: llvm-otool -vs __DATA __objc_selrefs %t/explicit-only-with-icf \
 # RUN:   | FileCheck %s --check-prefix=EXPLICIT-WITH-ICF
 
@@ -25,8 +25,7 @@
 # SELREFS-EMPTY:
 
 # RUN: %lld -dylib -arch arm64 -lSystem --icf=all -o %t/explicit-and-implicit \
-# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o %t/implicit-selrefs.o \
-# RUN:   -no_fixup_chains
+# RUN:   %t/explicit-selrefs-1.o %t/explicit-selrefs-2.o %t/implicit-selrefs.o
 # RUN: llvm-otool -vs __DATA __objc_selrefs %t/explicit-and-implicit \
 # RUN:   | FileCheck %s --check-prefix=EXPLICIT-AND-IMPLICIT
 
diff --git a/lldb/include/lldb/Target/DynamicLoader.h b/lldb/include/lldb/Target/DynamicLoader.h
index 0629e2faae7e9..15384245194b0 100644
--- a/lldb/include/lldb/Target/DynamicLoader.h
+++ b/lldb/include/lldb/Target/DynamicLoader.h
@@ -9,8 +9,8 @@
 #ifndef LLDB_TARGET_DYNAMICLOADER_H
 #define LLDB_TARGET_DYNAMICLOADER_H
 
-#include "lldb/Core/Address.h"
 #include "lldb/Core/PluginInterface.h"
+#include "lldb/Symbol/Symbol.h"
 #include "lldb/Utility/FileSpec.h"
 #include "lldb/Utility/Status.h"
 #include "lldb/Utility/UUID.h"
@@ -25,7 +25,6 @@ namespace lldb_private {
 class ModuleList;
 class Process;
 class SectionList;
-class Symbol;
 class SymbolContext;
 class SymbolContextList;
 class Thread;
@@ -330,10 +329,10 @@ class DynamicLoader : public PluginInterface {
   /// safe to call certain APIs or SPIs.
   virtual bool IsFullyInitialized() { return true; }
 
-  /// Return the `start` \b address in the dynamic loader module.
-  /// This is the address the process will begin executing with
+  /// Return the `start` function \b symbol in the dynamic loader module.
+  /// This is the symbol the process will begin executing with
   /// `process launch --stop-at-entry`.
-  virtual std::optional<lldb_private::Address> GetStartAddress() {
+  virtual std::optional<lldb_private::Symbol> GetStartSymbol() {
     return std::nullopt;
   }
 
diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
index 3863b6b3520db..5c6331735bde8 100644
--- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
+++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
@@ -609,7 +609,7 @@ void DynamicLoaderDarwin::UpdateDYLDImageInfoFromNewImageInfo(
   }
 }
 
-std::optional<lldb_private::Address> DynamicLoaderDarwin::GetStartAddress() {
+std::optional<lldb_private::Symbol> DynamicLoaderDarwin::GetStartSymbol() {
   Log *log = GetLog(LLDBLog::DynamicLoader);
 
   auto log_err = [log](llvm::StringLiteral err_msg) -> std::nullopt_t {
@@ -626,7 +626,7 @@ std::optional<lldb_private::Address> DynamicLoaderDarwin::GetStartAddress() {
   if (!symbol)
     return log_err("Cannot find `start` symbol in DYLD module.");
 
-  return symbol->GetAddress();
+  return *symbol;
 }
 
 void DynamicLoaderDarwin::SetDYLDModule(lldb::ModuleSP &dyld_module_sp) {
diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
index 3613c4c29b178..4ac55fdf6f3af 100644
--- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
+++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
@@ -56,8 +56,6 @@ class DynamicLoaderDarwin : public lldb_private::DynamicLoader {
 
   virtual bool NeedToDoInitialImageFetch() = 0;
 
-  std::optional<lldb_private::Address> GetStartAddress() override;
-
 protected:
   void PrivateInitialize(lldb_private::Process *process);
 
@@ -69,6 +67,8 @@ class DynamicLoaderDarwin : public lldb_private::DynamicLoader {
   // Clear method for classes derived from this one
   virtual void DoClear() = 0;
 
+  std::optional<lldb_private::Symbol> GetStartSymbol() override;
+
   void SetDYLDModule(lldb::ModuleSP &dyld_module_sp);
 
   lldb::ModuleSP GetDYLDModule();
diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt
index 52d726451ada9..43f88f7257924 100644
--- a/llvm/benchmarks/CMakeLists.txt
+++ b/llvm/benchmarks/CMakeLists.txt
@@ -1,5 +1,4 @@
 set(LLVM_LINK_COMPONENTS
   Support)
 
-add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED)
-add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED)
+add_benchmark(DummyYAML DummyYAML.cpp)
diff --git a/llvm/benchmarks/xxhash.cpp b/llvm/benchmarks/xxhash.cpp
deleted file mode 100644
index 429cbc0fa87d4..0000000000000
--- a/llvm/benchmarks/xxhash.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "llvm/Support/xxhash.h"
-#include "benchmark/benchmark.h"
-
-#include <memory>
-
-static uint32_t xorshift(uint32_t State) {
-  State ^= State << 13;
-  State ^= State >> 17;
-  State ^= State << 5;
-  return State;
-}
-
-static void BM_xxh3_64bits(benchmark::State &State) {
-  std::unique_ptr<uint32_t[]> Data(new uint32_t[State.range(0) / 4]);
-
-  uint32_t Prev = 0xcafebabe;
-  for (int64_t I = 0; I < State.range(0) / 4; I++)
-    Data[I] = Prev = xorshift(Prev);
-
-  llvm::ArrayRef DataRef =
-      llvm::ArrayRef(reinterpret_cast<uint8_t *>(Data.get()), State.range(0));
-
-  for (auto _ : State)
-    llvm::xxh3_64bits(DataRef);
-}
-
-BENCHMARK(BM_xxh3_64bits)->Arg(32)->Arg(512)->Arg(64 * 1024)->Arg(1024 * 1024);
-
-BENCHMARK_MAIN();
diff --git a/llvm/docs/DirectX/DXILOpTableGenDesign.rst b/llvm/docs/DirectX/DXILOpTableGenDesign.rst
index 50d801bd05efd..5f3ac75e82494 100644
--- a/llvm/docs/DirectX/DXILOpTableGenDesign.rst
+++ b/llvm/docs/DirectX/DXILOpTableGenDesign.rst
@@ -24,12 +24,12 @@ DXIL Operations are represented in one of the following `two ways
 
    These are  collectively referred to as `LLVM Intrinsics` in this note.
 
-Following is the complete list of properties of DXIL Ops with the corresponding field name
-as used in ``hctdb.py``. A DXIL Op is represented by a set of associated properties. These
-are consumed in DXIL backend passes as well as in other usage scenarios such as validation,
+Following is the complete list of attributes of DXIL Ops with the corresponding field name
+as used in ``hctdb.py``. A DXIL Op is represented by a set of associated attributes. These
+are consumed in DXIL backend passes as well as in other usage scenarios such as validation,
 DXIL reader, etc.
 
-A. Properties consumed in DXIL backend passes
+A. Attributes consumed in DXIL backend passes
 
    1. Name of operation (``dxil_op``)
    2. A string that documents the operation (``doc``) - This is not strictly necessary but is included
@@ -38,7 +38,7 @@ A. Properties consumed in DXIL backend passes
    4. Unique Integer ID (``dxil_opid``)
    5. Operation Class signifying the name and function signature of the operation (``dxil_class``).
       This string is an integral part of the DXIL Op function name and is constructed in
-      the format ``dx.op.<class-name>.<overload-type>``. Each DXIL Op call target function name
+      the format ``dx.op.<class-name>.<overload-type>``. Each DXIL Op call target function name
       is required to conform to this format per existing contract with the driver.
    6. List of valid overload types for the operation (``oload_types``).
    7. Required DXIL Version with support for the operation.
@@ -58,26 +58,26 @@ A. Properties consumed in DXIL backend passes
 Motivation
 ==========
 
-DXIL backend passes depend on various properties of DXIL Operations. For example, ``DXILOpLowering``
+DXIL backend passes depend on various attributes of DXIL Operations. For example, ``DXILOpLowering``
 pass will need information such as the DXIL operation an LLVM intrinsic is to be lowered to,
-along with valid overload and argument types etc. The TableGen file -
+along with valid overload and parameter types etc. The TableGen file -
 ``llvm/lib/Target/DirectX/DXIL.td`` - is used to represent DXIL Operations
-by specifying their properties listed above. ``DXIL.td`` is designed to be the single source
-of reference of DXIL Operations primarily for the implementation of passes in DXIL backend in
-``llvm-project`` repo - analogous to ``hctdb.py`` for ``DirectXShadeCompiler`` repo. However,
-the current design does not intend to encapsulate various validation rules, present in ``hctdb.py``,
-but do not pertain to DXIL Operations. It needs to have a rich representation capabilities that
-TableGen backends (such as ``DXILEmitter``) can rely on. Additionally, the DXIL Op specification
+by specifying their attributes listed above. ``DXIL.td`` is designed to be the single source
+of reference of DXIL Operations, primarily for the implementation of passes in the DXIL backend
+in the ``llvm-project`` repo - analogous to ``hctdb.py`` for the ``DirectXShaderCompiler`` repo.
+However, the current design does not intend to encapsulate validation rules that are present in
+``hctdb.py`` but do not pertain to DXIL Operations. It needs rich representation capabilities that
+TableGen backends (such as ``DXILEmitter``) can rely on. Additionally, the DXIL Op specification
 should be easy to read and comprehend.
 
 This note provides the design of the specification DXIL Ops as TableGen class ``DXILOp``
-by specifying its properties identified above.
+by specifying its attributes identified above.
 
 DXIL Operation Specification
 ============================
 
 The DXIL Operation is represented using the TableGen class ``DXILOp``. The DXIL operation
-properties are specified as fields of the ``DXILOp`` class as described below.
+attributes are specified as fields of the ``DXILOp`` class as described below.
 
 1. Each DXIL Operation is represented as a TableGen record. The name of each of the records
    signifies operation name.
@@ -93,81 +93,76 @@ properties are specified as fields of the ``DXILOp`` class as described below.
         class DXILOpClass;
 
    Concrete operation records, such as ``unary`` are defined by inheriting from ``DXILOpClass``.
-6. Return type of the operation is represented as ``LLVMType``.
-7. Operation arguments are represented as a list of ``LLVMType`` with each type
-   corresponding to the argument position. An overload type, if supported by the operation, is
-   denoted as the positional type ``overloadTy`` in the argument or in the result, where
-   ``overloadTy`` is defined to be synonymous to ``llvm_any_ty``.
+6. Return and argument types of the operation are represented as ``dag``s using the
+   special markers ``out`` and ``ins``. An overload type, if supported by the operation, is
+   denoted as the positional type ``dxil_overload_ty`` in the argument or in the result, where
+   ``dxil_overload_ty`` is defined to be synonymous to ``llvm_any_ty``.
 
    .. code-block::
 
-      defvar overloadTy = llvm_any_ty
+      defvar dxil_overload_ty = llvm_any_ty
 
-   Empty list, ``[]`` represents an operation with no arguments.
 
-8. Valid operation overload types predicated on DXIL version are specified as
-   a list of ``Overloads`` records. Representation of ``Overloads``
+7. Valid overload types and shader stages predicated on Shader Model version are specified
+   as a list of ``Constraint`` records. Representation of the ``Constraint`` class is described
+   in a later section.
+8. Various attributes of the DXIL Operation that are not predicated on Shader Model version
+   are represented as a ``dag`` using the special marker ``attrs``. Representation of the ``DXILAttributes``
    class is described in a later section.
-9.  Valid shader stages predicated on DXIL version are specified as a list of
-    ``Stages`` records. Representation of ``Stages`` class is
-    described in a later section.
-10. Various attributes of the DXIL Operation are represented as a ``list`` of
-    ``Attributes`` class records. Representation of ``Attributes``
-    class is described in a later section.
 
-Types specific to DXIL
-----------------------
-
-Type notation used in this document viz., ``<size>Ty`` corresponds to TableGen records for
-LLVM types ``llvm_<size>_ty``. Apart from ``overloadTy`` described above, ``resRetF32Ty`` is
-used to denote resource return type and ``handleTy`` is used to denote handle type.
-
-Specification of DXIL Operation
-================================
-
-A DXIL Operation is represented by the following TableGen class that encapsulates the various
-TableGen representations of its properties described above.
+A DXIL Operation is represented by the following TableGen class, which encapsulates the various
+TableGen representations of its attributes described above.
 
 .. code-block::
 
    // Abstraction DXIL Operation
-   class DXILOp<int opcode, DXILOpClass opclass> {
+   class DXILOp {
      // A short description of the operation
      string Doc = "";
 
      // Opcode of DXIL Operation
-     int OpCode = opcode;
+     int OpCode = 0;
 
      // Class of DXIL Operation.
-     DXILOpClass OpClass = opclass;
+     DXILOpClass OpClass = UnknownOpClass;
 
      // LLVM Intrinsic DXIL Operation maps to
      Intrinsic LLVMIntrinsic = ?;
 
-     // Result type of the op.
-     LLVMType result;
+     // Dag containing the arguments of the op. Default to 0 arguments.
+     dag arguments = (ins);
 
-     // List of argument types of the op. Default to 0 arguments.
-     list<LLVMType> arguments = [];
+     // Results of the op. Default to 0 results.
+     dag result = (out);
 
-     // List of valid overload types predicated by DXIL version
-     list<Overloads> overloads;
+     // List of constraints predicated on Shader Model version
+     list<SMVersionConstraints> sm_constraints;
 
-     // List of valid shader stages predicated by DXIL version
-    list<Stages> stages;
-
-     // List of valid attributes predicated by DXIL version
-     list<Attributes> attributes = [];
+     // Non-predicated operation attributes
+     dag attributes = (attrs);
+     Version DXILVersion = ?;
    }
 
-Version Specification
-=====================
+Constraint Specification
+========================
+
+DXIL Operation attributes such as valid overload types and valid shader stages are
+predicated on Shader Model version. These are represented as a list of constrained
+attributes.
 
-DXIL version is used to specify various version-dependent operation properties in
-place of Shader Model version.
+Following is the definition of a generic constraint and the associated predicate
+
+.. code-block::
+
+   // Primitive predicate
+   class Pred;
+
+   // Generic constraint
+   class Constraint<Pred pred> {
+     Pred predicate = pred;
+   }
 
-A ``Version`` class encapsulating ``Major`` and ``Minor`` version number is defined
-as follows:
+Shader Model version is represented as follows:
 
 .. code-block::
 
@@ -177,271 +172,111 @@ as follows:
      int Minor = minor;
    }
 
+   // Valid Shader model version records
 
-Concrete representations of valid DXIL versions are defined as follows:
-
-.. code-block::
-
-   // Definition of DXIL Version 1.0 - 1.8
+   // Definition of Shader Model 6.0 - 6.8 and DXIL Version 1.0 - 1.8
    foreach i = 0...8 in {
-     def DXIL1_#i : Version<1, i>;
+     def SM6_#i : Version<6, i>;
+     def DX1_#i : Version<1, i>;
    }
 
-Shader Stage Specification
-==========================
-
-Various shader stages such as ``compute``, ``pixel``, ``vertex``, etc., are represented
-as follows
-
-.. code-block::
-
-   // Shader stages
-   class DXILShaderStage;
-
-   def compute : DXILShaderStage;
-   def pixel : DXILShaderStage;
-   def vertex : DXILShaderStage;
-   ...
-
-Shader Attribute Specification
-==============================
-
-Various operation memory access and boolean attributes such as ``ReadNone``,
-``IsWave`` etc., are represented as follows
+A shader model version predicate class is defined as
 
 .. code-block::
 
-  class DXILAttribute;
-
-  def ReadOnly : DXILOpAttributes;
-  def ReadNone : DXILOpAttributes;
-  def IsWave : DXILOpAttributes;
-  ...
-
-Versioned Property Specification
-================================
-
-DXIL Operation properties such as valid overload types, shader stages and
-attributes are predicated on DXIL version. These are represented as list of
-versioned properties.
-
-Overload Type Specification
----------------------------
-
-``overloads`` field of ``class DXILOp`` is used to represent valid operation
-overloads predicated on DXIL version as list of records of the following class
-
-.. code-block::
-
-   class Overloads<Version minver, list<LLVMType> ols> {
-     Version dxil_version = minver;
-     list<LLVMType> overload_types = ols;
+   class SMVersion<Version ver> : Pred {
+     Version SMVersion = ver;
    }
 
-Following is an example specification of valid overload types for ``DXIL1_0`` and
-``DXIL1_2``.
+A constraint class to represent overload types and shader stages predicated on shader
+model version is defined as
 
 .. code-block::
 
-   overloads = [
-                 Overloads<DXIL1_0, [halfTy, floatTy]>,
-                 Overloads<DXIL1_2, [halfTy, floatTy, doubleTy]>
-               ];
-
-An empty list signifies that the operation supports no overload types.
-
-
-Stages Specification
---------------------
-
-``stages`` field of ``class DXILOp`` is used to represent valid operation
-stages predicated on DXIL version as list of records of the following class
-
-.. code-block::
-
-   class Stages<Version minver, list<DXILShaderStage> sts> {
-     Version dxil_version = minver;
-     list<DXILShaderStage> shader_stages = sts;
+   class SMVersionConstraints<SMVersion smver, dag oloads, dag stages> : Constraint<smver> {
+     dag overload_types = oloads;
+     dag stage_kinds = stages;
    }
 
-Following is an example specification of valid stages for ``DXIL1_0``,
-``DXIL1_2``, ``DXIL1_4`` and ``DXIL1_6``.
-
-.. code-block::
+The ``dag overload_types`` and ``dag stage_kinds`` fields use the special markers ``overloads``
+and ``stages``, respectively.
 
-   stages = [
-             Stages<DXIL1_0, [compute, pixel]>,
-             Stages<DXIL1_2, [compute, pixel, mesh]>,
-             Stages<DXIL1_4, [all_stages]>,
-             Stages<DXIL1_6, [removed]>
-            ];
-
-The following two pseudo stage records in addition to standard shader stages
-are defined.
-
-1. ``all_stages`` signifies that the operation is valid for all stages in the
-   specified DXIL version and later.
-2. ``removed`` signifies removal of support for the operation in the specified
-   DXIL version and later.
-
-A non-empty list of supported stages is required to be specified. If an operation
-is supported in all DXIL versions and all stages it is required to be specified as
-
-.. code-block::
-
-   stages = [Stages<DXIL1_0, [all_stages]>];
-
-
-Attribute Specification
+Examples of Constraints
 -----------------------
 
-``attributes`` field of ``class DXILOp`` is used to represent valid operation
-attributes predicated on DXIL version as list of records of the following class
+Consider a DXIL Operation that is valid in Shader Model 6.2 and later,
 
-.. code-block::
+1. with valid overload types ``half``, ``float``, ``i16`` and ``i32``
+2. is valid for stages ``pixel`` and ``compute``
+3. with additional valid overload types ``double`` and ``i64`` in Shader Model version 6.3 and later
+4. is valid for all stages in Shader Model version 6.3 and later
 
-  class Attributes<MinVersion minver, list<DXILAttribute> attrs> {
-    MinVersion dxil_version = ver;
-    list<DXILAttribute> attributes = attrs;
-  }
-
-Following is an example specification of valid attributes for ``DXIL1_0``.
+This is represented as
 
 .. code-block::
 
-   attributes = [Attributes<DXIL1_0, [ReadNone]];
-
-A null list of ``attributes`` signifies no operation attributes.
-
-Interpretation of Multiple Versioned Properties
------------------------------------------------
-
-Each of the versioned properties states that the specified overload type, stage or
-attribute records are valid for the predicated DXIL version. Only
-the properties corresponding to latest minimal DXIL version are applicable.
-Note as in the above example, any overload types, stages or attributes,
-that remain valid in a later DXIL version need to be specified in full.
-For example, consider the following specification of valid overload types:
-
-.. code-block::
-
-   overloads = [
-                Overloads<DXIL1_0, [halfTy, floatTy]>,
-                Overloads<DXIL1_2, [halfTy, floatTy, doubleTy]>
-               ];
-
-It specifies that the overload types ``halfTy`` and ``floatTy`` are valid for DXIL
-version 1.0 and later. It also specifies that  ``doubleTy`` is additionally supported
-in DXIL version 1.2 and later.
+   [SMVersionConstraints<SMVersion<SM6_2>,
+                          (overloads llvm_half_ty, llvm_float_ty, llvm_i16_ty, llvm_i32_ty),
+                          (stages pixel, compute)>,
+    SMVersionConstraints<SMVersion<SM6_3>,
+                          (overloads llvm_half_ty, llvm_float_ty, llvm_double_ty,
+                                 llvm_i16_ty, llvm_i32_ty, llvm_i64_ty),
+                          (stages allKinds)>];
 
-This provides the flexibility to specify properties independent of other
-versioned specifications in the list.
+Consider a DXIL operation that is valid in Shader Model version 6.2 and later,
 
+1. with no overload types, i.e., all argument types and the result type are fixed.
+2. is valid for all stages.
 
-DXIL Operation Specification Examples
-=====================================
-
-Following examples illustrate the specification of some of the DXIL Ops.
-
-``Sin`` operation - an operation valid in all DXIL versions and all stages
-and has valid overload types predicated on DXIL version.
-
-.. code-block::
-
-  def Sin : DXILOp<13, unary> {
-    let Doc = "Returns sine(theta) for theta in radians.";
-    let LLVMIntrinsic = int_sin;
-    let result = overloadTy;
-    let arguments = [overloadTy];
-    let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
-    let stages = [Stages<DXIL1_0, [all_stages]>];
-    let attributes = [Attributes<DXIL1_0, [ReadNone]>];
-  }
-
-``FlattenedThreadIdInGroup`` - an operation with no arguments, no
-overload types, and valid stages and attributes predicated by DXIL Version.
+This is represented as
 
 .. code-block::
 
-   def FlattenedThreadIdInGroup :  DXILOp<96, flattenedThreadIdInGroup> {
-    let Doc = "Provides a flattened index for a given thread within a given "
-              "group (SV_GroupIndex)";
-    let LLVMIntrinsic = int_dx_flattened_thread_id_in_group;
-    let result = i32Ty;
-    let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
-    let attributes = [Attributes<DXIL1_0, [ReadNone]>];
-   }
+     [SMVersionConstraints<SMVersion<SM6_2>, (overloads), (stages allKinds)>];
 
-``RawBufferStore`` - an operation with ``void`` return type, valid overload types
-predicated by DXIL Version and valid in all DXIL versions and stages.
 
-.. code-block::
+Specifying attributes predicated on Shader Model version using the single field
+``sm_constraints`` not only allows all of them to be specified together but
+also provides a single place to specify the minimum Shader Model version that
+supports the operation. Thus, a separate field is not needed to specify the
+minimum Shader Model version.
 
-   def RawBufferStore : DXILOp<140, rawBufferStore> {
-     let Doc = "Writes to a RWByteAddressBuffer or RWStructuredBuffer.";
-     let result = voidTy;
-     let arguments = [dxil_resource_ty, i32Ty, i32Ty, overloadTy,
-                      overloadTy, overloadTy, overloadTy, i8Ty, i32Ty];
-     let overloads = [
-                      Overloads<DXIL1_2, [halfTy, floatTy, i16Ty, i32Ty]>,
-                      Overloads<DXIL1_3>,[halfTy, floatTy, doubleTy,
-                                                   i16Ty, i32Ty, i64Ty]>
-                     ];
-      let stages = [Stages<DXIL1_2, all_stages>];
-      let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
-   }
+Attribute Specification
+=======================
 
-``DerivCoarseX`` - an operation with no overload types and stages predicated
-by DXIL Version.
+DXIL Operation attributes that are not predicated on any constraint are represented as
+a ``dag`` of Attribute records of the following abstract ``DXILAttributes`` class.
 
 .. code-block::
 
-   def DerivCoarseX : DXILOp<83, unary> {
-    let doc = "Computes the rate of change per stamp in x direction.";
-    let LLVMIntrinsic = int_dx_ddx;
-    let result = overloadTy;
-    let arguments = [overloadTy];
-    let stages = [
-                   Stages<DXIL1_0, [library, pixel]>,
-                   Stages<DXIL1_6, [library, pixel, amplification, compute, mesh]>
-                 ];
-    let attributes = [Attributes<DXIL1_0, [ReadNone]>];
-   }
+  class DXILAttributes;
 
-``CreateHandle`` - an operation with no overload types, no associated ``LLVMIntrinsic``
-and stages predicated  by DXIL Version.
+The following example records represent memory attributes
 
 .. code-block::
 
-   def CreateHandle : DXILOp<57, createHandle> {
-     let doc = "Creates the handle to a resource";
-     let result = i32Ty;
-     let arguments = [i8Ty, i32Ty, i32Ty, i1Ty];
-     let stages = [
-                   Stages<DXIL1_0, [all_stages]>,
-                   Stages<DXIL1_6, [removed]
-                  ];
-     let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
-   }
+  def ReadOnly : DXILAttributes;
+  def ReadNone : DXILAttributes;
 
-``Sample`` - an operation with valid overload types, stages and attributes
-predicated by DXIL version.
+DXIL Operation Specification Example
+====================================
+Following illustrates the specification of the DXIL Op ``Sin``.
 
 .. code-block::
 
-   def Sample : DXILOp<60, sample> {
-     let Doc = "Samples a texture";
-     let LLVMIntrinsic = int_dx_sample;
-     let result = resRetF32Ty;
-     let arguments = [handleTy, handleTy, floatTy, floatTy, floatTy, floatTy,
-                      i32Ty, i32Ty, i32Ty, floatTy];
-     let overloads = [Overloads<DXIL1_0, [halfTy, floatTy, i16Ty, i32Ty]>];
-     let stages = [
-                   Stages<DXIL1_0, [library, pixel]>,
-                   Stages<DXIL1_6, [library, pixel, amplification, compute, mesh]>
-                  ];
-     let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
-   }
+  def Sin : DXILOp {
+    let Doc = "Returns sine(theta) for theta in radians.";
+    let OpCode = 13;
+    let OpClass = unary;
+    let LLVMIntrinsic = int_sin;
+    let arguments = (ins LLVMMatchType<0>);
+    let result = (out dxil_overload_ty);
+    let sm_constraints = [SMVersionConstraints<SMVersion<SM6_0>,
+                          (overloads llvm_half_ty, llvm_float_ty),
+                          (stages allKinds)>];
+    let attributes = (attrs ReadNone);
+    let DXILVersion = DX1_0;
+  }
 
 Summary
 =======
diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md
index e027c902e58e1..cf2cc6305f130 100644
--- a/llvm/docs/PointerAuth.md
+++ b/llvm/docs/PointerAuth.md
@@ -18,9 +18,6 @@ At the IR level, it is represented using:
 * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers)
 * a [signed pointer constant](#constant) (to sign globals)
 * a [call operand bundle](#operand-bundle) (to authenticate called pointers)
-* a [set of function attributes](#function-attributes) (to describe what
-  pointers are signed and how, to control implicit codegen in the backend, as
-  well as preserve invariants in the mid-level optimizer)
 
 The current implementation leverages the
 [Armv8.3-A PAuth/Pointer Authentication Code](#armv8-3-a-pauth-pointer-authentication-code)
@@ -290,27 +287,6 @@ but with the added guarantee that `%fp_i`, `%fp_auth`, and `%fp_auth_p`
 are not stored to (and reloaded from) memory.
 
 
-### Function Attributes
-
-Some function attributes are used to describe other pointer authentication
-operations that are not otherwise explicitly expressed in IR.
-
-#### ``ptrauth-indirect-gotos``
-
-``ptrauth-indirect-gotos`` specifies that indirect gotos in this function
-should authenticate their target.  At the IR level, no other change is needed.
-When lowering [``blockaddress`` constants](https://llvm.org/docs/LangRef.html#blockaddress),
-and [``indirectbr`` instructions](https://llvm.org/docs/LangRef.html#i-indirectbr),
-this tells the backend to respectively sign and authenticate the pointers.
-
-The specific scheme isn't ABI-visible.  Currently, the AArch64 backend
-signs blockaddresses using the `ASIA` key, with an integer discriminator
-derived from the parent function's name, using the SipHash stable discriminator:
-```
-  ptrauth_string_discriminator("<function_name> blockaddress")
-```
-
-
 ## AArch64 Support
 
 AArch64 is currently the only architecture with full support of the pointer
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index b3c7b0e3883d0..4474478b6d3f8 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -153,6 +153,7 @@ on support follow.
      ``Za64rs``        Supported (`See note <#riscv-profiles-extensions-note>`__)
      ``Zaamo``         Assembly Support
      ``Zabha``         Supported
+     ``Zacas``         Supported (`See note <#riscv-zacas-note>`__)
      ``Zalrsc``        Assembly Support
      ``Zama16b``       Supported (`See note <#riscv-profiles-extensions-note>`__)
      ``Zawrs``         Assembly Support
@@ -280,6 +281,11 @@ Supported
 ``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare``
   These extensions are defined as part of the `RISC-V Profiles specification <https://github.com/riscv/riscv-profiles/releases/tag/v1.0>`__.  They do not introduce any new features themselves, but instead describe existing hardware features.
 
+  .. _riscv-zacas-note:
+
+``Zacas``
+  amocas.w will be used for i32 cmpxchg. amocas.d will be used for i64 cmpxchg on RV64. The compiler will not generate amocas.d on RV32 or amocas.q on RV64 due to ABI compatibility. These can only be used in the assembler.
+
 Experimental Extensions
 =======================
 
@@ -293,9 +299,6 @@ The primary goal of experimental support is to assist in the process of ratifica
 ``experimental-ssqosid``
   LLVM implements assembler support for the `v1.0-rc1 draft specification <https://github.com/riscv/riscv-ssqosid/releases/tag/v1.0-rc1>`_.
 
-``experimental-zacas``
-  LLVM implements the `1.0 release specification <https://github.com/riscvarchive/riscv-zacas/releases/tag/v1.0>`__. amocas.w will be used for i32 cmpxchg. amocas.d will be used i64 cmpxchg on RV64. The compiler will not generate amocas.d on RV32 or amocas.q on RV64 due to ABI compatibilty. These can only be used in the assembler. The extension will be left as experimental until `an ABI issue <https://github.com/riscv-non-isa/riscv-elf-psabi-doc/issues/444>`__ is resolved.
-
 ``experimental-zalasr``
   LLVM implements the `0.0.5 draft specification <https://github.com/mehnadnerd/riscv-zalasr>`__.
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 46fa2a74450e1..827a0fd3606ec 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -47,30 +47,110 @@ Non-comprehensive list of changes in this release
 Update on required toolchains to build LLVM
 -------------------------------------------
 
+* The minimum Python version has been raised from 3.6 to 3.8 across all of LLVM.
+  This enables the use of many new Python features, aligning more closely with
+  modern Python best practices, and improves CI maintainability.
+  See `#78828 <https://github.com/llvm/llvm-project/pull/78828>`_ for more info.
+
 Changes to the LLVM IR
 ----------------------
 
+* Added Memory Model Relaxation Annotations (MMRAs).
+* Added ``nusw`` and ``nuw`` flags to ``getelementptr`` instruction.
+* Renamed ``llvm.experimental.vector.reverse`` intrinsic to ``llvm.vector.reverse``.
+* Renamed ``llvm.experimental.vector.splice`` intrinsic to ``llvm.vector.splice``.
+* Renamed ``llvm.experimental.vector.interleave2`` intrinsic to ``llvm.vector.interleave2``.
+* Renamed ``llvm.experimental.vector.deinterleave2`` intrinsic to ``llvm.vector.deinterleave2``.
+* The constant expression variants of the following instructions have been
+  removed:
+
+  * ``icmp``
+  * ``fcmp``
+  * ``shl``
+* LLVM has switched from using debug intrinsics in textual IR to using debug
+  records by default. Details of the change and instructions on how to update
+  any downstream tools and tests can be found in the `migration docs
+  <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
+* Semantics of MC/DC intrinsics have been changed.
+
+  * ``llvm.instprof.mcdc.parameters``: 3rd argument has been changed
+    from bytes to bits.
+  * ``llvm.instprof.mcdc.condbitmap.update``: Removed.
+  * ``llvm.instprof.mcdc.tvbitmap.update``: 3rd argument has been
+    removed. The next argument has been changed from byte index to bit
+    index.
+* Added ``llvm.experimental.vector.compress`` intrinsic.
+
 Changes to LLVM infrastructure
 ------------------------------
 
 Changes to building LLVM
 ------------------------
 
+* LLVM now has rpmalloc version 1.4.5 in-tree, as a replacement C allocator for
+  hosted toolchains. It supports several host platforms, such as Mac and Unix;
+  however, currently only the Windows 64-bit LLVM release uses it.
+  This greatly improves build times on Windows when using ThinLTO
+  linking, especially on machines with many cores, by up to an order of magnitude
+  or more. Clang compilation is also improved. See the build timings in
+  (`#91862 <https://github.com/llvm/llvm-project/pull/91862#issue-2291033962>`_).
+  For more information, refer to the **LLVM_ENABLE_RPMALLOC** option in `CMake variables <https://llvm.org/docs/CMake.html#llvm-related-variables>`_.
+
+* The ``LLVM_ENABLE_TERMINFO`` flag has been removed. LLVM no longer depends on
+  terminfo and now always uses the ``TERM`` environment variable for color
+  support autodetection.
+
 Changes to TableGen
 -------------------
 
+- Type aliases can now be defined via the new keyword ``deftype``.
+
 Changes to Interprocedural Optimizations
 ----------------------------------------
 
 Changes to the AArch64 Backend
 ------------------------------
 
+* Added support for Cortex-R82AE, Cortex-A78AE, Cortex-A520AE, Cortex-A720AE,
+  Cortex-A725, Cortex-X925, Neoverse-N3, Neoverse-V3 and Neoverse-V3AE CPUs.
+
+* ``-mbranch-protection=standard`` now enables FEAT_PAuth_LR by
+  default when the ``+pauth-lr`` feature is enabled. The new behaviour results
+  in ``standard`` being equal to ``bti+pac-ret+pc`` when ``+pauth-lr``
+  is passed as part of the ``-mcpu=`` options.
+
+* SVE and SVE2 have been moved to the default extensions list for ARMv9.0,
+  making them optional per the Arm ARM.  Existing v9.0+ CPUs in the backend that
+  support these extensions continue to have these features enabled by default
+  when specified via ``-march=`` or an ``-mcpu=`` that supports them.  The
+  attribute ``"target-features"="+v9a"`` no longer implies ``"+sve"`` and
+  ``"+sve2"`` respectively.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
+* Implemented the ``llvm.get.fpenv`` and ``llvm.set.fpenv`` intrinsics.
+* Added ``!amdgpu.no.fine.grained.memory`` and
+  ``!amdgpu.no.remote.memory`` metadata to control atomic behavior.
+
+* Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>`
+
+* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
+  ``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
+  :ref:`atomicrmw <i_atomicrmw>` instruction with ``fadd``, ``fmin`` and
+  ``fmax`` with addrspace(3) instead.
+
+* AMDGPUAttributor is no longer run as part of the codegen pass
+  pipeline. It is expected to run as part of the middle end
+  optimizations.
+
 Changes to the ARM Backend
 --------------------------
 
+* Added support for Cortex-R52+ CPU.
+* FEAT_F32MM is no longer activated by default when using ``+sve`` on v8.6-A or greater. The feature is still available and can be used by adding ``+f32mm`` to the command line options.
+* armv8-r now implies only fp-armv8d16sp, rather than neon and full fp-armv8. These features are still included by default for cortex-r52. The default CPU for armv8-r is now "generic", for compatibility with variants that do not include neon, fp64, and d32.
+
 Changes to the AVR Backend
 --------------------------
 
@@ -83,6 +163,10 @@ Changes to the Hexagon Backend
 Changes to the LoongArch Backend
 --------------------------------
 
+* i32 is now a native type in the datalayout string. This enables
+  LoopStrengthReduce for loops with i32 induction variables, among other
+  optimizations.
+
 Changes to the MIPS Backend
 ---------------------------
 
@@ -92,6 +176,37 @@ Changes to the PowerPC Backend
 Changes to the RISC-V Backend
 -----------------------------
 
+* Added full support for the experimental Zabha (Byte and
+  Halfword Atomic Memory Operations) extension.
+* Added assembler/disassembler support for the experimental Zalasr
+  (Load-Acquire and Store-Release) extension.
+* The majority of the S-prefixed (supervisor-level) extension
+  names in the RISC-V profiles specification are now recognised.
+* Codegen support was added for the Zimop (May-Be-Operations) extension.
+* The experimental Ssnpm, Smnpm, Smmpm, Sspm, and Supm 1.0.0 Pointer Masking extensions are supported.
+* The experimental Ssqosid extension is supported.
+* Zacas is no longer experimental.
+* Added the CSR names from the Resumable Non-Maskable Interrupts (Smrnmi) extension.
+* llvm-objdump now prints disassembled opcode bytes in groups of 2 or 4 bytes to
+  match GNU objdump. The bytes within the groups are in big endian order.
+* Added smstateen extension to -march. CSR names for smstateen were already supported.
+* Zaamo and Zalrsc are no longer experimental.
+* Processors that enable post reg-alloc scheduling (PostMachineScheduler) by default should use the ``UsePostRAScheduler`` subtarget feature. Setting ``PostRAScheduler = 1`` in the scheduler model will have no effect on the enabling of the PostMachineScheduler.
+* Zabha is no longer experimental.
+* B (the collection of the Zba, Zbb, Zbs extensions) is supported.
+* Added smcdeleg, ssccfg, smcsrind, and sscsrind extensions to -march.
+* ``-mcpu=syntacore-scr3-rv32`` and ``-mcpu=syntacore-scr3-rv64`` were added.
+* The default atomics mapping was changed to emit an additional trailing fence
+  for sequentially consistent stores, offering compatibility with a future
+  mapping using load-acquire and store-release instructions while remaining
+  fully compatible with objects produced prior to this change. The mapping
+  (ABI) used is recorded as an ELF attribute.
+* Ztso is no longer experimental.
+* The WCH / Nanjing Qinheng Microelectronics QingKe "XW" compressed opcodes are
+  supported under the name "Xwchc".
+* ``-mcpu=native`` now detects available features with hwprobe (RISC-V Hardware Probing Interface) on Linux 6.4 or later.
+* The version of Zicfilp/Zicfiss is updated to 1.0.
+
 Changes to the WebAssembly Backend
 ----------------------------------
 
@@ -101,6 +216,12 @@ Changes to the Windows Target
 Changes to the X86 Backend
 --------------------------
 
+- Removed knl/knm specific ISA intrinsics: AVX512PF, AVX512ER, PREFETCHWT1,
+  while assembly encoding/decoding supports are kept.
+
+- Removed ``3DNow!``-specific ISA intrinsics and codegen support. The ``3dnow`` and ``3dnowa`` target features are no longer supported. The intrinsics ``llvm.x86.3dnow.*``, ``llvm.x86.3dnowa.*``, and ``llvm.x86.mmx.femms`` have been removed. Assembly encoding/decoding for the corresponding instructions remains supported.
+
+
 Changes to the OCaml bindings
 -----------------------------
 
@@ -110,6 +231,104 @@ Changes to the Python bindings
 Changes to the C API
 --------------------
 
+* Added ``LLVMGetBlockAddressFunction`` and ``LLVMGetBlockAddressBasicBlock``
+  functions for accessing the values in a blockaddress constant.
+
+* Added ``LLVMConstStringInContext2`` function, which better matches the C++
+  API by using ``size_t`` for string length. Deprecated ``LLVMConstStringInContext``.
+
+* Added the following functions for accessing a function's prefix data:
+
+  * ``LLVMHasPrefixData``
+  * ``LLVMGetPrefixData``
+  * ``LLVMSetPrefixData``
+
+* Added the following functions for accessing a function's prologue data:
+
+  * ``LLVMHasPrologueData``
+  * ``LLVMGetPrologueData``
+  * ``LLVMSetPrologueData``
+
+* Deprecated ``LLVMConstNUWNeg`` and ``LLVMBuildNUWNeg``.
+
+* Added ``LLVMAtomicRMWBinOpUIncWrap`` and ``LLVMAtomicRMWBinOpUDecWrap`` to
+  ``LLVMAtomicRMWBinOp`` enum for AtomicRMW instructions.
+
+* Added ``LLVMCreateConstantRangeAttribute`` function for creating ConstantRange Attributes.
+
+* Added the following functions for creating and accessing data for CallBr instructions:
+
+  * ``LLVMBuildCallBr``
+  * ``LLVMGetCallBrDefaultDest``
+  * ``LLVMGetCallBrNumIndirectDests``
+  * ``LLVMGetCallBrIndirectDest``
+
+* The following functions for creating constant expressions have been removed,
+  because the underlying constant expressions are no longer supported. Instead,
+  an instruction should be created using the ``LLVMBuildXYZ`` APIs, which will
+  constant fold the operands if possible and create an instruction otherwise:
+
+  * ``LLVMConstICmp``
+  * ``LLVMConstFCmp``
+  * ``LLVMConstShl``
+
+**Note:** The following changes are due to the removal of the debug info
+intrinsics from LLVM and to the introduction of debug records into LLVM.
+They are described in detail in the `debug info migration guide <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
+
+* Added the following functions to insert before the indicated instruction but
+  after any attached debug records.
+
+  * ``LLVMPositionBuilderBeforeDbgRecords``
+  * ``LLVMPositionBuilderBeforeInstrAndDbgRecords``
+
+  Same as ``LLVMPositionBuilder`` and ``LLVMPositionBuilderBefore`` except the
+  insertion position is set to before the debug records that precede the target
+  instruction. ``LLVMPositionBuilder`` and ``LLVMPositionBuilderBefore`` are
+  unchanged.
+
+* Added the following functions to get/set the new non-instruction debug info format.
+  They will be deprecated in the future and they are just a transition aid.
+
+  * ``LLVMIsNewDbgInfoFormat``
+  * ``LLVMSetIsNewDbgInfoFormat``
+
+* Added the following functions to insert a debug record (new debug info format).
+
+  * ``LLVMDIBuilderInsertDeclareRecordBefore``
+  * ``LLVMDIBuilderInsertDeclareRecordAtEnd``
+  * ``LLVMDIBuilderInsertDbgValueRecordBefore``
+  * ``LLVMDIBuilderInsertDbgValueRecordAtEnd``
+
+* Deleted the following functions that inserted a debug intrinsic (old debug info format).
+
+  * ``LLVMDIBuilderInsertDeclareBefore``
+  * ``LLVMDIBuilderInsertDeclareAtEnd``
+  * ``LLVMDIBuilderInsertDbgValueBefore``
+  * ``LLVMDIBuilderInsertDbgValueAtEnd``
+
+* Added the following functions for accessing a Target Extension Type's data:
+
+  * ``LLVMGetTargetExtTypeName``
+  * ``LLVMGetTargetExtTypeNumTypeParams``/``LLVMGetTargetExtTypeTypeParam``
+  * ``LLVMGetTargetExtTypeNumIntParams``/``LLVMGetTargetExtTypeIntParam``
+
+* Added the following functions for accessing/setting the no-wrap flags for a
+  GetElementPtr instruction:
+
+  * ``LLVMBuildGEPWithNoWrapFlags``
+  * ``LLVMConstGEPWithNoWrapFlags``
+  * ``LLVMGEPGetNoWrapFlags``
+  * ``LLVMGEPSetNoWrapFlags``
+
+* Added the following functions for creating and accessing data for ConstantPtrAuth constants:
+
+  * ``LLVMConstantPtrAuth``
+  * ``LLVMGetConstantPtrAuthPointer``
+  * ``LLVMGetConstantPtrAuthKey``
+  * ``LLVMGetConstantPtrAuthDiscriminator``
+  * ``LLVMGetConstantPtrAuthAddrDiscriminator``
+
 Changes to the CodeGen infrastructure
 -------------------------------------
 
@@ -119,14 +338,106 @@ Changes to the Metadata Info
 Changes to the Debug Info
 ---------------------------------
 
+* LLVM has switched from using debug intrinsics internally to using debug
+  records by default. This should happen transparently when using the DIBuilder
+  to construct debug variable information, but will require changes for any code
+  that interacts with debug intrinsics directly. Debug intrinsics will only be
+  supported on a best-effort basis from here onwards; for more information, see
+  the `migration docs <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
+
+* When emitting DWARF v2 and not in strict DWARF mode, LLVM will now add
+  a ``DW_AT_type`` to instances of ``DW_TAG_enumeration_type``. This is actually
+  a DWARF v3 feature which tells tools what the enum's underlying type is.
+  Emitting this for v2 as well will help users who have to build binaries with
+  DWARF v2 but are using tools that understand newer DWARF standards. Older
+  tools will ignore it. (`#98335 <https://github.com/llvm/llvm-project/pull/98335>`_)
+
 Changes to the LLVM tools
 ---------------------------------
+* llvm-nm and llvm-objdump can now print symbol information from linked
+  WebAssembly binaries, using information from exports or the "name"
+  section for functions, globals and data segments. Symbol addresses and sizes
+  are printed as offsets in the file, allowing for binary size analysis. Wasm
+  files using reference types and GC are also supported (but likewise only for
+  functions, globals, and data, and only for listing symbols and names).
+
+* llvm-ar now utilizes LLVM_DEFAULT_TARGET_TRIPLE to determine the archive format
+  if it's not specified with the ``--format`` argument and cannot be inferred from
+  input files.
+
+* llvm-ar now allows specifying COFF archive format with ``--format`` argument
+  and uses it by default for COFF targets.
+
+* llvm-ranlib now supports ``-V`` as an alias for ``--version``.
+  ``-v`` (``--verbose`` in llvm-ar) has been removed.
+  (`#87661 <https://github.com/llvm/llvm-project/pull/87661>`_)
+
+* llvm-objcopy now supports ``--set-symbol-visibility`` and
+  ``--set-symbols-visibility`` options for ELF input to change the
+  visibility of symbols.
+
+* llvm-objcopy now supports ``--skip-symbol`` and ``--skip-symbols`` options
+  for ELF input to skip the specified symbols when executing other options
+  that can change a symbol's name, binding or visibility.
+
+* llvm-objcopy now supports ``--compress-sections`` to compress or decompress
+  arbitrary sections not within a segment.
+  (`#85036 <https://github.com/llvm/llvm-project/pull/85036>`_.)
+
+* llvm-profgen now supports COFF+DWARF binaries. This enables Sample-based PGO
+  on Windows using Intel VTune's SEP. For details on usage, see the `end-user
+  documentation for SPGO
+  <https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers>`_.
+
+* llvm-readelf's ``-r`` output for RELR has been improved.
+  (`#89162 <https://github.com/llvm/llvm-project/pull/89162>`_)
+  ``--raw-relr`` has been removed.
+
+* llvm-mca now aborts by default if it is given bad input where previously it
+  would continue. Additionally, it can now continue when it encounters
+  instructions which lack scheduling information. The behaviour can be
+  controlled by the newly introduced
+  ``--skip-unsupported-instructions=<none|lack-sched|parse-failure|any>``, as
+  documented in ``--help`` output and the command guide. (`#90474
+  <https://github.com/llvm/llvm-project/pull/90474>`_)
+
+* llvm-readobj's LLVM output format for ELF core files has been changed.
+  Similarly, the JSON format has been fixed for this case. The NT_FILE note
+  now has a map for the mapped files. (`#92835
+  <https://github.com/llvm/llvm-project/pull/92835>`_).
+
+* llvm-cov now generates HTML reports with JavaScript code that allows simple
+  jumping between uncovered parts (lines/regions/branches) of code,
+  using buttons in the top-right corner of the page or keys (L/R/B, or
+  shift+L/R/B to jump in the reverse direction). (`#95662
+  <https://github.com/llvm/llvm-project/pull/95662>`_).
+
+* llvm-objcopy now verifies the format of ``.note`` sections for ELF input. This
+  can be disabled by ``--no-verify-note-sections``. (`#90458
+  <https://github.com/llvm/llvm-project/pull/90458>`_).
 
 Changes to LLDB
 ---------------------------------
 
+* Register field information is now provided on AArch64 FreeBSD for live
+  processes and core files (previously only provided on AArch64 Linux).
+
+* Register field information can now include enums to represent field
+  values. Enums have been added for ``fpcr.RMode`` and ``mte_ctrl.TCF``
+  for AArch64 targets::
+
+    (lldb) register read fpcr
+        fpcr = 0x00000000
+             = (AHP = 0, DN = 0, FZ = 0, RMode = RN, <...>)
+
+  If you need to know the values of the enum, these can be found in
+  the output of ``register info`` for the same register.
+
 Changes to BOLT
 ---------------------------------
+* Now supports ``--match-profile-with-function-hash`` to match profiled and
+  binary functions by exact hash, allowing renamed but otherwise identical
+  functions to be matched.
 
 Changes to Sanitizers
 ---------------------
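
Among the C API additions listed in the release notes above,
``LLVMConstStringInContext2`` is essentially a drop-in replacement for the
deprecated ``LLVMConstStringInContext``, differing in taking a ``size_t`` length.
A minimal sketch of its use, assuming the signature otherwise mirrors the
deprecated function:

    #include <stdio.h>
    #include <string.h>
    #include <llvm-c/Core.h>

    int main(void) {
      LLVMContextRef ctx = LLVMContextCreate();

      /* size_t length instead of unsigned; the final flag controls whether
         a trailing NUL is appended (0 = null-terminate the constant). */
      const char *msg = "hello";
      LLVMValueRef str =
          LLVMConstStringInContext2(ctx, msg, strlen(msg), /*DontNullTerminate=*/0);

      char *dump = LLVMPrintValueToString(str);
      printf("%s\n", dump); /* prints the resulting [6 x i8] constant */
      LLVMDisposeMessage(dump);

      LLVMContextDispose(ctx);
      return 0;
    }
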
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2411b2b31d293..cf378008e4c7c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -419,12 +419,6 @@ class TargetTransformInfo {
   /// this factor, it is very likely to be predicted correctly.
   BranchProbability getPredictableBranchThreshold() const;
 
-  /// Returns estimated penalty of a branch misprediction in latency. Indicates
-  /// how aggressive the target wants for eliminating unpredictable branches. A
-  /// zero return value means extra optimization applied to them should be
-  /// minimal.
-  InstructionCost getBranchMispredictPenalty() const;
-
   /// Return true if branch divergence exists.
   ///
   /// Branch divergence has a significantly negative impact on GPU performance
@@ -1838,7 +1832,6 @@ class TargetTransformInfo::Concept {
                                              ArrayRef<const Value *> Operands,
                                              TargetCostKind CostKind) = 0;
   virtual BranchProbability getPredictableBranchThreshold() = 0;
-  virtual InstructionCost getBranchMispredictPenalty() = 0;
   virtual bool hasBranchDivergence(const Function *F = nullptr) = 0;
   virtual bool isSourceOfDivergence(const Value *V) = 0;
   virtual bool isAlwaysUniform(const Value *V) = 0;
@@ -2250,9 +2243,6 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   BranchProbability getPredictableBranchThreshold() override {
     return Impl.getPredictableBranchThreshold();
   }
-  InstructionCost getBranchMispredictPenalty() override {
-    return Impl.getBranchMispredictPenalty();
-  }
   bool hasBranchDivergence(const Function *F = nullptr) override {
     return Impl.hasBranchDivergence(F);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 00efa474a91b5..47fde08735c0c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -99,8 +99,6 @@ class TargetTransformInfoImplBase {
     return BranchProbability(99, 100);
   }
 
-  InstructionCost getBranchMispredictPenalty() const { return 0; }
-
   bool hasBranchDivergence(const Function *F = nullptr) const { return false; }
 
   bool isSourceOfDivergence(const Value *V) const { return false; }
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 8e5a8dc483d27..2c2f965a3cd6f 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -736,10 +736,6 @@ inline Value *getUnderlyingObject(Value *V, unsigned MaxLookup = 6) {
   return const_cast<Value *>(getUnderlyingObject(VConst, MaxLookup));
 }
 
-/// Like getUnderlyingObject(), but will try harder to find a single underlying
-/// object. In particular, this function also looks through selects and phis.
-const Value *getUnderlyingObjectAggressive(const Value *V);
-
 /// This method is similar to getUnderlyingObject except that it can
 /// look through phi and select instructions and return multiple objects.
 ///
@@ -826,25 +822,15 @@ bool isSafeToSpeculativelyExecute(const Instruction *I,
                                   const Instruction *CtxI = nullptr,
                                   AssumptionCache *AC = nullptr,
                                   const DominatorTree *DT = nullptr,
-                                  const TargetLibraryInfo *TLI = nullptr,
-                                  bool UseVariableInfo = true);
-
-inline bool isSafeToSpeculativelyExecute(const Instruction *I,
-                                         BasicBlock::iterator CtxI,
-                                         AssumptionCache *AC = nullptr,
-                                         const DominatorTree *DT = nullptr,
-                                         const TargetLibraryInfo *TLI = nullptr,
-                                         bool UseVariableInfo = true) {
-  // Take an iterator, and unwrap it into an Instruction *.
-  return isSafeToSpeculativelyExecute(I, &*CtxI, AC, DT, TLI, UseVariableInfo);
-}
+                                  const TargetLibraryInfo *TLI = nullptr);
 
-/// Don't use information from its non-constant operands. This helper is used
-/// when its operands are going to be replaced.
 inline bool
-isSafeToSpeculativelyExecuteWithVariableReplaced(const Instruction *I) {
-  return isSafeToSpeculativelyExecute(I, nullptr, nullptr, nullptr, nullptr,
-                                      /*UseVariableInfo=*/false);
+isSafeToSpeculativelyExecute(const Instruction *I, BasicBlock::iterator CtxI,
+                             AssumptionCache *AC = nullptr,
+                             const DominatorTree *DT = nullptr,
+                             const TargetLibraryInfo *TLI = nullptr) {
+  // Take an iterator, and unwrap it into an Instruction *.
+  return isSafeToSpeculativelyExecute(I, &*CtxI, AC, DT, TLI);
 }
 
 /// This returns the same result as isSafeToSpeculativelyExecute if Opcode is
@@ -867,7 +853,7 @@ isSafeToSpeculativelyExecuteWithVariableReplaced(const Instruction *I) {
 bool isSafeToSpeculativelyExecuteWithOpcode(
     unsigned Opcode, const Instruction *Inst, const Instruction *CtxI = nullptr,
     AssumptionCache *AC = nullptr, const DominatorTree *DT = nullptr,
-    const TargetLibraryInfo *TLI = nullptr, bool UseVariableInfo = true);
+    const TargetLibraryInfo *TLI = nullptr);
 
 /// Returns true if the result or effects of the given instructions \p I
 /// depend values not reachable through the def use graph.
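
[Editor's note] With the ``UseVariableInfo`` parameter reverted above, the
iterator overload simply unwraps to the pointer form; a minimal usage sketch
(``I`` and ``InsertPt`` are assumed to be supplied by the caller)::

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/BasicBlock.h"
  using namespace llvm;

  // Returns true if I could be executed speculatively at InsertPt's
  // position, per the overloads declared above.
  static bool canSpeculate(const Instruction *I,
                           BasicBlock::iterator InsertPt) {
    return isSafeToSpeculativelyExecute(I, InsertPt);
  }
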
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index f57be39076a78..1c4e9e9111441 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -577,9 +577,6 @@ class AsmPrinter : public MachineFunctionPass {
     report_fatal_error("ptrauth constant lowering not implemented");
   }
 
-  /// Lower the specified BlockAddress to an MCExpr.
-  virtual const MCExpr *lowerBlockAddressConstant(const BlockAddress &BA);
-
   /// Return true if the basic block has exactly one predecessor and the control
   /// transfer mechanism between the predecessor and this block is a
   /// fall-through.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 80897953156a0..d9f6f6540bdc8 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -900,25 +900,6 @@ class GShl : public GenericMachineInstr {
   };
 };
 
-/// Represents a threeway compare.
-class GSUCmp : public GenericMachineInstr {
-public:
-  Register getLHSReg() const { return getOperand(1).getReg(); }
-  Register getRHSReg() const { return getOperand(2).getReg(); }
-
-  bool isSigned() const { return getOpcode() == TargetOpcode::G_SCMP; }
-
-  static bool classof(const MachineInstr *MI) {
-    switch (MI->getOpcode()) {
-    case TargetOpcode::G_SCMP:
-    case TargetOpcode::G_UCMP:
-      return true;
-    default:
-      return false;
-    }
-  };
-};
-
 } // namespace llvm
 
 #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H
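
[Editor's note] These thin wrappers exist only to give ``isa``/``dyn_cast``
an opcode-based ``classof``, as the removed ``GSUCmp`` illustrated; a minimal
sketch using the surviving ``GShl`` wrapper (``MI`` is assumed to be an
arbitrary machine instruction)::

  #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
  using namespace llvm;

  // isa<> dispatches to GShl::classof, which checks the opcode the same
  // way GSUCmp::classof checked G_SCMP/G_UCMP.
  static bool isLogicalShiftLeft(const MachineInstr &MI) {
    return isa<GShl>(&MI);
  }
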
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index cf1ca1dc34024..b17bc9aa2a44e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -402,7 +402,6 @@ class LegalizerHelper {
 
   LegalizeResult lowerISFPCLASS(MachineInstr &MI);
 
-  LegalizeResult lowerThreewayCompare(MachineInstr &MI);
   LegalizeResult lowerMinMax(MachineInstr &MI);
   LegalizeResult lowerFCopySign(MachineInstr &MI);
   LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 5b657fb171296..daceaf98583bd 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -217,9 +217,9 @@ enum NodeType {
   /// UNDEF - An undefined node.
   UNDEF,
 
-  /// FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or
-  /// is evaluated to UNDEF), or returns VAL otherwise. Note that each
-  /// read of UNDEF can yield different value, but FREEZE(UNDEF) cannot.
+  // FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or
+  // is evaluated to UNDEF), or returns VAL otherwise. Note that each
+  // read of UNDEF can yield different value, but FREEZE(UNDEF) cannot.
   FREEZE,
 
   /// EXTRACT_ELEMENT - This is used to get the lower or upper (determined by
@@ -300,7 +300,7 @@ enum NodeType {
   /// it to the add/sub hardware instruction, and then inverting the outgoing
   /// carry/borrow.
   ///
-  /// The use of these opcodes is preferable to ADDE/SUBE if the target supports
+  /// The use of these opcodes is preferable to adde/sube if the target supports
   /// it, as the carry is a regular value rather than a glue, which allows
   /// further optimisation.
   ///
@@ -490,7 +490,7 @@ enum NodeType {
   STRICT_FSETCC,
   STRICT_FSETCCS,
 
-  /// FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic.
+  // FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic.
   FPTRUNC_ROUND,
 
   /// FMA - Perform a * b + c with no intermediate rounding step.
@@ -684,10 +684,10 @@ enum NodeType {
   AVGCEILS,
   AVGCEILU,
 
-  /// ABDS/ABDU - Absolute difference - Return the absolute difference between
-  /// two numbers interpreted as signed/unsigned.
-  /// i.e trunc(abs(sext(Op0) - sext(Op1))) becomes abds(Op0, Op1)
-  ///  or trunc(abs(zext(Op0) - zext(Op1))) becomes abdu(Op0, Op1)
+  // ABDS/ABDU - Absolute difference - Return the absolute difference between
+  // two numbers interpreted as signed/unsigned.
+  // i.e trunc(abs(sext(Op0) - sext(Op1))) becomes abds(Op0, Op1)
+  //  or trunc(abs(zext(Op0) - zext(Op1))) becomes abdu(Op0, Op1)
   ABDS,
   ABDU,
 
@@ -728,9 +728,8 @@ enum NodeType {
   /// amount modulo the element size of the first operand.
   ///
   /// Funnel 'double' shifts take 3 operands, 2 inputs and the shift amount.
-  ///
-  ///     fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
-  ///     fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+  /// fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+  /// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
   SHL,
   SRA,
   SRL,
@@ -788,8 +787,7 @@ enum NodeType {
 
   /// SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded
   /// integer shift operations.  The operation ordering is:
-  ///
-  ///     [Lo,Hi] = op [LoLHS,HiLHS], Amt
+  ///       [Lo,Hi] = op [LoLHS,HiLHS], Amt
   SHL_PARTS,
   SRA_PARTS,
   SRL_PARTS,
@@ -1000,7 +998,7 @@ enum NodeType {
 
   /// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two
   /// values.
-  ///
+  //
   /// In the case where a single input is a NaN (either signaling or quiet),
   /// the non-NaN input is returned.
   ///
@@ -1198,11 +1196,11 @@ enum NodeType {
   VAEND,
   VASTART,
 
-  /// PREALLOCATED_SETUP - This has 2 operands: an input chain and a SRCVALUE
-  /// with the preallocated call Value.
+  // PREALLOCATED_SETUP - This has 2 operands: an input chain and a SRCVALUE
+  // with the preallocated call Value.
   PREALLOCATED_SETUP,
-  /// PREALLOCATED_ARG - This has 3 operands: an input chain, a SRCVALUE
-  /// with the preallocated call Value, and a constant int.
+  // PREALLOCATED_ARG - This has 3 operands: an input chain, a SRCVALUE
+  // with the preallocated call Value, and a constant int.
   PREALLOCATED_ARG,
 
   /// SRCVALUE - This is a node type that holds a Value* that is used to
@@ -1321,24 +1319,24 @@ enum NodeType {
   ATOMIC_LOAD_UINC_WRAP,
   ATOMIC_LOAD_UDEC_WRAP,
 
-  /// Masked load and store - consecutive vector load and store operations
-  /// with additional mask operand that prevents memory accesses to the
-  /// masked-off lanes.
-  ///
-  ///     Val, OutChain = MLOAD(BasePtr, Mask, PassThru)
-  ///     OutChain = MSTORE(Value, BasePtr, Mask)
+  // Masked load and store - consecutive vector load and store operations
+  // with additional mask operand that prevents memory accesses to the
+  // masked-off lanes.
+  //
+  // Val, OutChain = MLOAD(BasePtr, Mask, PassThru)
+  // OutChain = MSTORE(Value, BasePtr, Mask)
   MLOAD,
   MSTORE,
 
-  /// Masked gather and scatter - load and store operations for a vector of
-  /// random addresses with additional mask operand that prevents memory
-  /// accesses to the masked-off lanes.
-  ///
-  ///     Val, OutChain = GATHER(InChain, PassThru, Mask, BasePtr, Index, Scale)
-  ///     OutChain = SCATTER(InChain, Value, Mask, BasePtr, Index, Scale)
-  ///
-  /// The Index operand can have more vector elements than the other operands
-  /// due to type legalization. The extra elements are ignored.
+  // Masked gather and scatter - load and store operations for a vector of
+  // random addresses with additional mask operand that prevents memory
+  // accesses to the masked-off lanes.
+  //
+  // Val, OutChain = GATHER(InChain, PassThru, Mask, BasePtr, Index, Scale)
+  // OutChain = SCATTER(InChain, Value, Mask, BasePtr, Index, Scale)
+  //
+  // The Index operand can have more vector elements than the other operands
+  // due to type legalization. The extra elements are ignored.
   MGATHER,
   MSCATTER,
 
@@ -1387,11 +1385,9 @@ enum NodeType {
   /// pow-of-2 vectors, one valid legalizer expansion is to use a tree
   /// reduction, i.e.:
   /// For RES = VECREDUCE_FADD <8 x f16> SRC_VEC
-  ///
-  ///     PART_RDX = FADD SRC_VEC[0:3], SRC_VEC[4:7]
-  ///     PART_RDX2 = FADD PART_RDX[0:1], PART_RDX[2:3]
-  ///     RES = FADD PART_RDX2[0], PART_RDX2[1]
-  ///
+  ///   PART_RDX = FADD SRC_VEC[0:3], SRC_VEC[4:7]
+  ///   PART_RDX2 = FADD PART_RDX[0:1], PART_RDX[2:3]
+  ///   RES = FADD PART_RDX2[0], PART_RDX2[1]
   /// For non-pow-2 vectors, this can be computed by extracting each element
   /// and performing the operation as if it were scalarized.
   VECREDUCE_FADD,
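
[Editor's note] The funnel-shift formulas documented above are easy to check
on plain integers; a worked sketch for a 32-bit element width (standalone,
with the ``Z % BW == 0`` edge case guarded, since a full-width shift is
undefined in C++)::

  #include <cstdint>

  static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
    const uint32_t BW = 32, S = Z % BW;
    // fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
    return S == 0 ? X : (X << S) | (Y >> (BW - S));
  }

  static uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
    const uint32_t BW = 32, S = Z % BW;
    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
    return S == 0 ? Y : (X << (BW - S)) | (Y >> S);
  }

For example, ``fshl32(0x000000FF, 0xFF000000, 8)`` yields ``0x0000FFFF``.
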
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index cafb9781698a2..f850767270a4f 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -205,20 +205,20 @@ namespace llvm {
   /// possible. It is best suited for debug code where live ranges are short.
   ///
   FunctionPass *createFastRegisterAllocator();
-  FunctionPass *createFastRegisterAllocator(RegAllocFilterFunc F,
+  FunctionPass *createFastRegisterAllocator(RegClassFilterFunc F,
                                             bool ClearVirtRegs);
 
   /// BasicRegisterAllocation Pass - This pass implements a degenerate global
   /// register allocator using the basic regalloc framework.
   ///
   FunctionPass *createBasicRegisterAllocator();
-  FunctionPass *createBasicRegisterAllocator(RegAllocFilterFunc F);
+  FunctionPass *createBasicRegisterAllocator(RegClassFilterFunc F);
 
   /// Greedy register allocation pass - This pass implements a global register
   /// allocator for optimized builds.
   ///
   FunctionPass *createGreedyRegisterAllocator();
-  FunctionPass *createGreedyRegisterAllocator(RegAllocFilterFunc F);
+  FunctionPass *createGreedyRegisterAllocator(RegClassFilterFunc F);
 
   /// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean
   /// Quadratic Prograaming (PBQP) based register allocator.
diff --git a/llvm/include/llvm/CodeGen/RegAllocCommon.h b/llvm/include/llvm/CodeGen/RegAllocCommon.h
index 6c5cb295f5290..ad533eab1861c 100644
--- a/llvm/include/llvm/CodeGen/RegAllocCommon.h
+++ b/llvm/include/llvm/CodeGen/RegAllocCommon.h
@@ -9,20 +9,18 @@
 #ifndef LLVM_CODEGEN_REGALLOCCOMMON_H
 #define LLVM_CODEGEN_REGALLOCCOMMON_H
 
-#include "llvm/CodeGen/Register.h"
 #include <functional>
 
 namespace llvm {
 
 class TargetRegisterClass;
 class TargetRegisterInfo;
-class MachineRegisterInfo;
 
 /// Filter function for register classes during regalloc. Default register class
 /// filter is nullptr, where all registers should be allocated.
 typedef std::function<bool(const TargetRegisterInfo &TRI,
-                           const MachineRegisterInfo &MRI, const Register Reg)>
-    RegAllocFilterFunc;
+                           const TargetRegisterClass &RC)>
+    RegClassFilterFunc;
 }
 
 #endif // LLVM_CODEGEN_REGALLOCCOMMON_H
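
[Editor's note] A minimal sketch of building a ``RegClassFilterFunc`` (the
typedef above) and handing it to the allocator factories from ``Passes.h``;
the predicate is illustrative, not a real target policy::

  #include "llvm/CodeGen/Passes.h"
  #include "llvm/CodeGen/RegAllocCommon.h"
  #include "llvm/CodeGen/TargetRegisterInfo.h"
  using namespace llvm;

  static FunctionPass *createFilteredGreedyRA() {
    RegClassFilterFunc OnlyAllocatable =
        [](const TargetRegisterInfo &TRI, const TargetRegisterClass &RC) {
          // Allocate only classes the target marks allocatable.
          return RC.isAllocatable();
        };
    return createGreedyRegisterAllocator(OnlyAllocatable);
  }
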
diff --git a/llvm/include/llvm/CodeGen/RegAllocFast.h b/llvm/include/llvm/CodeGen/RegAllocFast.h
index c99c715daacf9..c62bd14d0b4cb 100644
--- a/llvm/include/llvm/CodeGen/RegAllocFast.h
+++ b/llvm/include/llvm/CodeGen/RegAllocFast.h
@@ -15,7 +15,7 @@
 namespace llvm {
 
 struct RegAllocFastPassOptions {
-  RegAllocFilterFunc Filter = nullptr;
+  RegClassFilterFunc Filter = nullptr;
   StringRef FilterName = "all";
   bool ClearVRegs = true;
 };
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 649711d8faf65..5c7f6ddc94840 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2226,12 +2226,6 @@ class TargetInstrInfo : public MCInstrInfo {
     return OptLevel >= CodeGenOptLevel::Aggressive ? 4 : 2;
   }
 
-  /// Returns the target-specific default value for tail merging.
-  /// This value will be used if the tail-merge-size argument is not provided.
-  virtual unsigned getTailMergeSize(const MachineFunction &MF) const {
-    return 3;
-  }
-
   /// Returns the callee operand from the given \p MI.
   virtual const MachineOperand &getCalleeOperand(const MachineInstr &MI) const {
     return MI.getOperand(0);
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h
index fa522ff88e1db..b8b5f90b6b0fb 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h
@@ -184,7 +184,7 @@ template <typename T> class AllocGroupSmallMap {
   iterator end() { return Elems.end(); }
   iterator find(AllocGroup G) {
     auto I = lower_bound(Elems, G, compareKey);
-    return (I == end() || I->first == G) ? I : end();
+    return (I->first == G) ? I : end();
   }
 
   bool empty() const { return Elems.empty(); }
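
[Editor's note] The ``find`` above is the standard ``lower_bound`` idiom on a
sorted small vector; a generic sketch on a plain ``std::vector`` (mirroring
the shape, not the exact types) shows why the dereference should be guarded
against the ``end()`` case::

  #include <algorithm>
  #include <utility>
  #include <vector>

  template <typename K, typename V>
  typename std::vector<std::pair<K, V>>::iterator
  findSorted(std::vector<std::pair<K, V>> &Elems, const K &Key) {
    auto I = std::lower_bound(
        Elems.begin(), Elems.end(), Key,
        [](const std::pair<K, V> &E, const K &Rhs) { return E.first < Rhs; });
    // lower_bound returns end() when Key is greater than every stored
    // key, so check that before comparing I->first.
    return (I != Elems.end() && I->first == Key) ? I : Elems.end();
  }
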
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index a6995888de7d4..02c0265354ccd 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2322,26 +2322,6 @@ class OpenMPIRBuilder {
       EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
       Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
 
-  /// Generate a target-task for the target construct
-  ///
-  /// \param OutlinedFn The outlined device/target kernel function.
-  /// \param OutlinedFnID The ooulined function ID.
-  /// \param EmitTargetCallFallbackCB Call back function to generate host
-  ///        fallback code.
-  /// \param Args Data structure holding information about the kernel arguments.
-  /// \param DeviceID Identifier for the device via the 'device' clause.
-  /// \param RTLoc Source location identifier
-  /// \param AllocaIP The insertion point to be used for alloca instructions.
-  /// \param Dependencies Vector of DependData objects holding information of
-  ///        dependencies as specified by the 'depend' clause.
-  /// \param HasNoWait True if the target construct had 'nowait' on it, false
-  ///        otherwise
-  InsertPointTy emitTargetTask(
-      Function *OutlinedFn, Value *OutlinedFnID,
-      EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
-      Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
-      SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);
-
   /// Emit the arguments to be passed to the runtime library based on the
   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
   /// ForEndCall, emit map types to be passed for the end of the region instead
@@ -2825,8 +2805,6 @@ class OpenMPIRBuilder {
   /// \param BodyGenCB Callback that will generate the region code.
   /// \param ArgAccessorFuncCB Callback that will generate accessors
   /// instructions for passed in target arguments where neccessary
-  /// \param Dependencies A vector of DependData objects that carry
-  // dependency information as passed in the depend clause
   InsertPointTy createTarget(const LocationDescription &Loc,
                              OpenMPIRBuilder::InsertPointTy AllocaIP,
                              OpenMPIRBuilder::InsertPointTy CodeGenIP,
@@ -2835,8 +2813,7 @@ class OpenMPIRBuilder {
                              SmallVectorImpl<Value *> &Inputs,
                              GenMapInfoCallbackTy GenMapInfoCB,
                              TargetBodyGenCallbackTy BodyGenCB,
-                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
-                             SmallVector<DependData> Dependencies = {});
+                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB);
 
   /// Returns __kmpc_for_static_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ab2620fdcf6b3..ca85ff30f683f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1138,19 +1138,6 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsi
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
 
-class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
-  [data_ty],
-  [llvm_v4i32_ty,     // rsrc(SGPR)
-   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
-   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
-                      //                                       bit 1 = slc,
-                      //                                       bit 2 = dlc on gfx10+),
-                      //                      swizzled buffer (bit 3 = swz))
-  [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
-  AMDGPURsrcIntrinsic<0>;
-def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad;
-
 class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [AMDGPUBufferRsrcTy,    // rsrc(SGPR)
@@ -1169,19 +1156,6 @@ class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntri
 def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;
 
-class AMDGPURawPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
-  [data_ty],
-  [AMDGPUBufferRsrcTy,// rsrc(SGPR)
-   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
-   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
-                      //                                       bit 1 = slc,
-                      //                                       bit 2 = dlc on gfx10+),
-                      //                      swizzled buffer (bit 3 = swz))
-  [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
-  AMDGPURsrcIntrinsic<0>;
-def int_amdgcn_raw_ptr_atomic_buffer_load : AMDGPURawPtrAtomicBufferLoad;
-
 class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [llvm_v4i32_ty,    // rsrc(SGPR)
diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index 736f44686689b..f91c8e1fc9af3 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -217,9 +217,8 @@ class MCAsmBackend {
   virtual bool writeNopData(raw_ostream &OS, uint64_t Count,
                             const MCSubtargetInfo *STI) const = 0;
 
-  // Return true if fragment offsets have been adjusted and an extra layout
-  // iteration is needed.
-  virtual bool finishLayout(const MCAssembler &Asm) const { return false; }
+  /// Give backend an opportunity to finish layout after relaxation
+  virtual void finishLayout(MCAssembler const &Asm) const {}
 
   /// Handle any target-specific assembler flags. By default, do nothing.
   virtual void handleAssemblerFlag(MCAssemblerFlag Flag) {}
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h
index d9752912ee66a..4b08d50de9e22 100644
--- a/llvm/include/llvm/MC/MCAssembler.h
+++ b/llvm/include/llvm/MC/MCAssembler.h
@@ -15,9 +15,13 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/SMLoc.h"
+#include "llvm/Support/VersionTuple.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -52,10 +56,25 @@ class MCValue;
 
 class MCAssembler {
 public:
-  friend class MCObjectWriter;
   using SectionListType = SmallVector<MCSection *, 0>;
   using const_iterator = pointee_iterator<SectionListType::const_iterator>;
 
+  /// MachO specific deployment target version info.
+  // A Major version of 0 indicates that no version information was supplied
+  // and so the corresponding load command should not be emitted.
+  using VersionInfoType = struct {
+    bool EmitBuildVersion;
+    union {
+      MCVersionMinType Type;          ///< Used when EmitBuildVersion==false.
+      MachO::PlatformType Platform;   ///< Used when EmitBuildVersion==true.
+    } TypeOrPlatform;
+    unsigned Major;
+    unsigned Minor;
+    unsigned Update;
+    /// An optional version of the SDK that was used to build the source.
+    VersionTuple SDKVersion;
+  };
+
 private:
   MCContext &Context;
 
@@ -65,11 +84,21 @@ class MCAssembler {
 
   bool HasLayout = false;
   bool RelaxAll = false;
+  bool SubsectionsViaSymbols = false;
+  bool IncrementalLinkerCompatible = false;
 
   SectionListType Sections;
 
   SmallVector<const MCSymbol *, 0> Symbols;
 
+  /// The list of linker options to propagate into the object file.
+  std::vector<std::vector<std::string>> LinkerOptions;
+
+  /// List of declared file names
+  std::vector<std::pair<std::string, size_t>> FileNames;
+  // Optional compiler version.
+  std::string CompilerVersion;
+
   MCDwarfLineTableParams LTParams;
 
   /// The set of function symbols for which a .thumb_func directive has
@@ -86,6 +115,20 @@ class MCAssembler {
   /// By default it's 0, which means bundling is disabled.
   unsigned BundleAlignSize = 0;
 
+  /// ELF specific e_header flags
+  // It would be good if there were an MCELFAssembler class to hold this.
+  // ELF header flags are used both by the integrated and standalone assemblers.
+  // Access to the flags is necessary in cases where assembler directives affect
+  // which flags to be set.
+  unsigned ELFHeaderEFlags = 0;
+
+  /// Used to communicate Linker Optimization Hint information between
+  /// the Streamer and the .o writer
+  MCLOHContainer LOHContainer;
+
+  VersionInfoType VersionInfo;
+  VersionInfoType DarwinTargetVariantVersionInfo;
+
   /// Evaluate a fixup to a relocatable expression and the value which should be
   /// placed into the fixup.
   ///
@@ -111,7 +154,6 @@ class MCAssembler {
   /// Check whether the given fragment needs relaxation.
   bool fragmentNeedsRelaxation(const MCRelaxableFragment *IF) const;
 
-  void layoutSection(MCSection &Sec);
   /// Perform one layout iteration and return true if any offsets
   /// were adjusted.
   bool layoutOnce();
@@ -132,6 +174,15 @@ class MCAssembler {
   handleFixup(MCFragment &F, const MCFixup &Fixup, const MCSubtargetInfo *STI);
 
 public:
+  struct Symver {
+    SMLoc Loc;
+    const MCSymbol *Sym;
+    StringRef Name;
+    // True if .symver *, *@@@* or .symver *, *, remove.
+    bool KeepOriginalSym;
+  };
+  std::vector<Symver> Symvers;
+
   /// Construct a new assembler instance.
   //
   // FIXME: How are we going to parameterize this? Two obvious options are stay
@@ -143,14 +194,16 @@ class MCAssembler {
               std::unique_ptr<MCObjectWriter> Writer);
   MCAssembler(const MCAssembler &) = delete;
   MCAssembler &operator=(const MCAssembler &) = delete;
+  ~MCAssembler();
 
   /// Compute the effective fragment size.
   uint64_t computeFragmentSize(const MCFragment &F) const;
 
   void layoutBundle(MCFragment *Prev, MCFragment *F) const;
+  void ensureValid(MCSection &Sec) const;
 
   // Get the offset of the given fragment inside its containing section.
-  uint64_t getFragmentOffset(const MCFragment &F) const { return F.Offset; }
+  uint64_t getFragmentOffset(const MCFragment &F) const;
 
   uint64_t getSectionAddressSize(const MCSection &Sec) const;
   uint64_t getSectionFileSize(const MCSection &Sec) const;
@@ -175,6 +228,48 @@ class MCAssembler {
   /// Flag a function symbol as the target of a .thumb_func directive.
   void setIsThumbFunc(const MCSymbol *Func) { ThumbFuncs.insert(Func); }
 
+  /// ELF e_header flags
+  unsigned getELFHeaderEFlags() const { return ELFHeaderEFlags; }
+  void setELFHeaderEFlags(unsigned Flags) { ELFHeaderEFlags = Flags; }
+
+  /// MachO deployment target version information.
+  const VersionInfoType &getVersionInfo() const { return VersionInfo; }
+  void setVersionMin(MCVersionMinType Type, unsigned Major, unsigned Minor,
+                     unsigned Update,
+                     VersionTuple SDKVersion = VersionTuple()) {
+    VersionInfo.EmitBuildVersion = false;
+    VersionInfo.TypeOrPlatform.Type = Type;
+    VersionInfo.Major = Major;
+    VersionInfo.Minor = Minor;
+    VersionInfo.Update = Update;
+    VersionInfo.SDKVersion = SDKVersion;
+  }
+  void setBuildVersion(MachO::PlatformType Platform, unsigned Major,
+                       unsigned Minor, unsigned Update,
+                       VersionTuple SDKVersion = VersionTuple()) {
+    VersionInfo.EmitBuildVersion = true;
+    VersionInfo.TypeOrPlatform.Platform = Platform;
+    VersionInfo.Major = Major;
+    VersionInfo.Minor = Minor;
+    VersionInfo.Update = Update;
+    VersionInfo.SDKVersion = SDKVersion;
+  }
+
+  const VersionInfoType &getDarwinTargetVariantVersionInfo() const {
+    return DarwinTargetVariantVersionInfo;
+  }
+  void setDarwinTargetVariantBuildVersion(MachO::PlatformType Platform,
+                                          unsigned Major, unsigned Minor,
+                                          unsigned Update,
+                                          VersionTuple SDKVersion) {
+    DarwinTargetVariantVersionInfo.EmitBuildVersion = true;
+    DarwinTargetVariantVersionInfo.TypeOrPlatform.Platform = Platform;
+    DarwinTargetVariantVersionInfo.Major = Major;
+    DarwinTargetVariantVersionInfo.Minor = Minor;
+    DarwinTargetVariantVersionInfo.Update = Update;
+    DarwinTargetVariantVersionInfo.SDKVersion = SDKVersion;
+  }
+
   /// Reuse an assembler instance
   ///
   void reset();
@@ -185,6 +280,8 @@ class MCAssembler {
 
   MCCodeEmitter *getEmitterPtr() const { return Emitter.get(); }
 
+  MCObjectWriter *getWriterPtr() const { return Writer.get(); }
+
   MCAsmBackend &getBackend() const { return *Backend; }
 
   MCCodeEmitter &getEmitter() const { return *Emitter; }
@@ -192,6 +289,7 @@ class MCAssembler {
   MCObjectWriter &getWriter() const { return *Writer; }
 
   MCDwarfLineTableParams getDWARFLinetableParams() const { return LTParams; }
+  void setDWARFLinetableParams(MCDwarfLineTableParams P) { LTParams = P; }
 
   /// Finish - Do final processing and write the object to the output stream.
   /// \p Writer is used for custom object writer (as the MCJIT does),
@@ -201,6 +299,17 @@ class MCAssembler {
   // Layout all section and prepare them for emission.
   void layout();
 
+  // FIXME: This does not belong here.
+  bool getSubsectionsViaSymbols() const { return SubsectionsViaSymbols; }
+  void setSubsectionsViaSymbols(bool Value) { SubsectionsViaSymbols = Value; }
+
+  bool isIncrementalLinkerCompatible() const {
+    return IncrementalLinkerCompatible;
+  }
+  void setIncrementalLinkerCompatible(bool Value) {
+    IncrementalLinkerCompatible = Value;
+  }
+
   bool hasLayout() const { return HasLayout; }
   bool getRelaxAll() const { return RelaxAll; }
   void setRelaxAll(bool Value) { RelaxAll = Value; }
@@ -224,14 +333,56 @@ class MCAssembler {
     return make_pointee_range(Symbols);
   }
 
+  /// @}
+  /// \name Linker Option List Access
+  /// @{
+
+  std::vector<std::vector<std::string>> &getLinkerOptions() {
+    return LinkerOptions;
+  }
+
+  // FIXME: This is a total hack, this should not be here. Once things are
+  // factored so that the streamer has direct access to the .o writer, it can
+  // disappear.
+  MCLOHContainer &getLOHContainer() { return LOHContainer; }
+  const MCLOHContainer &getLOHContainer() const {
+    return const_cast<MCAssembler *>(this)->getLOHContainer();
+  }
+
+  struct CGProfileEntry {
+    const MCSymbolRefExpr *From;
+    const MCSymbolRefExpr *To;
+    uint64_t Count;
+  };
+  std::vector<CGProfileEntry> CGProfile;
+  /// @}
+  /// \name Backend Data Access
+  /// @{
+
   bool registerSection(MCSection &Section);
   bool registerSymbol(const MCSymbol &Symbol);
 
+  MutableArrayRef<std::pair<std::string, size_t>> getFileNames() {
+    return FileNames;
+  }
+
+  void addFileName(StringRef FileName) {
+    FileNames.emplace_back(std::string(FileName), Symbols.size());
+  }
+
+  void setCompilerVersion(std::string CompilerVers) {
+    if (CompilerVersion.empty())
+      CompilerVersion = std::move(CompilerVers);
+  }
+  StringRef getCompilerVersion() { return CompilerVersion; }
+
   /// Write the necessary bundle padding to \p OS.
   /// Expects a fragment \p F containing instructions and its size \p FSize.
   void writeFragmentPadding(raw_ostream &OS, const MCEncodedFragment &F,
                             uint64_t FSize) const;
 
+  /// @}
+
   void dump() const;
 };
 
diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h
index 57ba40f7ac26f..73fe6361cb866 100644
--- a/llvm/include/llvm/MC/MCContext.h
+++ b/llvm/include/llvm/MC/MCContext.h
@@ -468,10 +468,6 @@ class MCContext {
   /// true, behaves like getOrCreateSymbol, prefixed with PrivateLabelPrefix.
   MCSymbol *createBlockSymbol(const Twine &Name, bool AlwaysEmit = false);
 
-  /// Create a local, non-temporary symbol like an ELF mapping symbol. Calling
-  /// the function with the same name will generate new, unique instances.
-  MCSymbol *createLocalSymbol(StringRef Name);
-
   /// Create the definition of a directional local symbol for numbered label
   /// (used for "1:" definitions).
   MCSymbol *createDirectionalLocalSymbol(unsigned LocalLabelVal);
@@ -647,7 +643,7 @@ class MCContext {
   MCSectionXCOFF *getXCOFFSection(
       StringRef Section, SectionKind K,
       std::optional<XCOFF::CsectProperties> CsectProp = std::nullopt,
-      bool MultiSymbolsAllowed = false,
+      bool MultiSymbolsAllowed = false, const char *BeginSymName = nullptr,
       std::optional<XCOFF::DwarfSectionSubtypeFlags> DwarfSubtypeFlags =
           std::nullopt);
 
diff --git a/llvm/include/llvm/MC/MCELFObjectWriter.h b/llvm/include/llvm/MC/MCELFObjectWriter.h
index 9b74cbc3d3a52..12237094ad86a 100644
--- a/llvm/include/llvm/MC/MCELFObjectWriter.h
+++ b/llvm/include/llvm/MC/MCELFObjectWriter.h
@@ -9,8 +9,6 @@
 #ifndef LLVM_MC_MCELFOBJECTWRITER_H
 #define LLVM_MC_MCELFOBJECTWRITER_H
 
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionELF.h"
@@ -18,8 +16,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstdint>
-#include <memory>
-#include <optional>
 #include <vector>
 
 namespace llvm {
@@ -29,7 +25,6 @@ class MCContext;
 class MCFixup;
 class MCSymbol;
 class MCSymbolELF;
-class MCTargetOptions;
 class MCValue;
 
 struct ELFRelocationEntry {
@@ -154,66 +149,20 @@ class MCELFObjectTargetWriter : public MCObjectTargetWriter {
   }
 };
 
-class ELFObjectWriter : public MCObjectWriter {
-  unsigned ELFHeaderEFlags = 0;
+/// Construct a new ELF writer instance.
+///
+/// \param MOTW - The target specific ELF writer subclass.
+/// \param OS - The stream to write to.
+/// \returns The constructed object writer.
+std::unique_ptr<MCObjectWriter>
+createELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                      raw_pwrite_stream &OS, bool IsLittleEndian);
+
+std::unique_ptr<MCObjectWriter>
+createELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                         raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
+                         bool IsLittleEndian);
 
-public:
-  std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
-  raw_pwrite_stream &OS;
-  raw_pwrite_stream *DwoOS = nullptr;
-
-  DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
-  DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
-  bool IsLittleEndian = false;
-  bool SeenGnuAbi = false;
-  std::optional<uint8_t> OverrideABIVersion;
-
-  struct Symver {
-    SMLoc Loc;
-    const MCSymbol *Sym;
-    StringRef Name;
-    // True if .symver *, *@@@* or .symver *, *, remove.
-    bool KeepOriginalSym;
-  };
-  SmallVector<Symver, 0> Symvers;
-
-  ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                  raw_pwrite_stream &OS, bool IsLittleEndian);
-  ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                  raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
-                  bool IsLittleEndian);
-
-  void reset() override;
-  void executePostLayoutBinding(MCAssembler &Asm) override;
-  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
-                        const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue) override;
-  bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
-                                              const MCSymbol &SymA,
-                                              const MCFragment &FB, bool InSet,
-                                              bool IsPCRel) const override;
-  uint64_t writeObject(MCAssembler &Asm) override;
-
-  bool hasRelocationAddend() const;
-  bool usesRela(const MCTargetOptions *TO, const MCSectionELF &Sec) const;
-
-  bool shouldRelocateWithSymbol(const MCAssembler &Asm, const MCValue &Val,
-                                const MCSymbolELF *Sym, uint64_t C,
-                                unsigned Type) const;
-
-  bool checkRelocation(MCContext &Ctx, SMLoc Loc, const MCSectionELF *From,
-                       const MCSectionELF *To);
-
-  unsigned getELFHeaderEFlags() const { return ELFHeaderEFlags; }
-  void setELFHeaderEFlags(unsigned Flags) { ELFHeaderEFlags = Flags; }
-
-  // Mark that we have seen GNU ABI usage (e.g. SHF_GNU_RETAIN, STB_GNU_UNIQUE).
-  void markGnuAbi() { SeenGnuAbi = true; }
-  bool seenGnuAbi() const { return SeenGnuAbi; }
-
-  // Override the default e_ident[EI_ABIVERSION] in the ELF header.
-  void setOverrideABIVersion(uint8_t V) { OverrideABIVersion = V; }
-};
 } // end namespace llvm
 
 #endif // LLVM_MC_MCELFOBJECTWRITER_H
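
[Editor's note] With the writer class hidden again behind a factory, callers
construct it through ``createELFObjectWriter``; a minimal sketch matching the
declaration above (the target writer and output stream are assumed to come
from the caller, and little-endian is assumed purely for illustration)::

  #include "llvm/MC/MCELFObjectWriter.h"
  #include <memory>
  using namespace llvm;

  static std::unique_ptr<MCObjectWriter>
  makeELFWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
                raw_pwrite_stream &OS) {
    // Endianness is a property of the target.
    return createELFObjectWriter(std::move(MOTW), OS,
                                 /*IsLittleEndian=*/true);
  }
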
diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h
index 028bdd1a6bacf..1cd64e406d4d0 100644
--- a/llvm/include/llvm/MC/MCELFStreamer.h
+++ b/llvm/include/llvm/MC/MCELFStreamer.h
@@ -11,7 +11,6 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCObjectStreamer.h"
 
 namespace llvm {
@@ -43,8 +42,6 @@ class MCELFStreamer : public MCObjectStreamer {
     MCObjectStreamer::reset();
   }
 
-  ELFObjectWriter &getWriter();
-
   /// \name MCStreamer Interface
   /// @{
 
diff --git a/llvm/include/llvm/MC/MCMachObjectWriter.h b/llvm/include/llvm/MC/MCMachObjectWriter.h
index 4386d84748271..c26c75e98844c 100644
--- a/llvm/include/llvm/MC/MCMachObjectWriter.h
+++ b/llvm/include/llvm/MC/MCMachObjectWriter.h
@@ -12,14 +12,11 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/MachO.h"
-#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/EndianStream.h"
-#include "llvm/Support/VersionTuple.h"
 #include <cstdint>
 #include <memory>
 #include <string>
@@ -91,21 +88,6 @@ class MachObjectWriter : public MCObjectWriter {
     MCSymbol *End;
   };
 
-  // A Major version of 0 indicates that no version information was supplied
-  // and so the corresponding load command should not be emitted.
-  using VersionInfoType = struct {
-    bool EmitBuildVersion;
-    union {
-      MCVersionMinType Type;        ///< Used when EmitBuildVersion==false.
-      MachO::PlatformType Platform; ///< Used when EmitBuildVersion==true.
-    } TypeOrPlatform;
-    unsigned Major;
-    unsigned Minor;
-    unsigned Update;
-    /// An optional version of the SDK that was used to build the source.
-    VersionTuple SDKVersion;
-  };
-
 private:
   /// Helper struct for containing some precomputed information on symbols.
   struct MachSymbolData {
@@ -158,15 +140,6 @@ class MachObjectWriter : public MCObjectWriter {
 
   /// @}
 
-  // Used to communicate Linker Optimization Hint information.
-  MCLOHContainer LOHContainer;
-
-  VersionInfoType VersionInfo{};
-  VersionInfoType TargetVariantVersionInfo{};
-
-  // The list of linker options for LC_LINKER_OPTION.
-  std::vector<std::vector<std::string>> LinkerOptions;
-
   MachSymbolData *findSymbolData(const MCSymbol &Sym);
 
   void writeWithPadding(StringRef Str, uint64_t Size);
@@ -204,7 +177,6 @@ class MachObjectWriter : public MCObjectWriter {
     return SectionOrder;
   }
   SectionAddrMap &getSectionAddressMap() { return SectionAddress; }
-  MCLOHContainer &getLOHContainer() { return LOHContainer; }
 
   uint64_t getSectionAddress(const MCSection *Sec) const {
     return SectionAddress.lookup(Sec);
@@ -220,42 +192,6 @@ class MachObjectWriter : public MCObjectWriter {
 
   bool doesSymbolRequireExternRelocation(const MCSymbol &S);
 
-  /// Mach-O deployment target version information.
-  void setVersionMin(MCVersionMinType Type, unsigned Major, unsigned Minor,
-                     unsigned Update,
-                     VersionTuple SDKVersion = VersionTuple()) {
-    VersionInfo.EmitBuildVersion = false;
-    VersionInfo.TypeOrPlatform.Type = Type;
-    VersionInfo.Major = Major;
-    VersionInfo.Minor = Minor;
-    VersionInfo.Update = Update;
-    VersionInfo.SDKVersion = SDKVersion;
-  }
-  void setBuildVersion(MachO::PlatformType Platform, unsigned Major,
-                       unsigned Minor, unsigned Update,
-                       VersionTuple SDKVersion = VersionTuple()) {
-    VersionInfo.EmitBuildVersion = true;
-    VersionInfo.TypeOrPlatform.Platform = Platform;
-    VersionInfo.Major = Major;
-    VersionInfo.Minor = Minor;
-    VersionInfo.Update = Update;
-    VersionInfo.SDKVersion = SDKVersion;
-  }
-  void setTargetVariantBuildVersion(MachO::PlatformType Platform,
-                                    unsigned Major, unsigned Minor,
-                                    unsigned Update, VersionTuple SDKVersion) {
-    TargetVariantVersionInfo.EmitBuildVersion = true;
-    TargetVariantVersionInfo.TypeOrPlatform.Platform = Platform;
-    TargetVariantVersionInfo.Major = Major;
-    TargetVariantVersionInfo.Minor = Minor;
-    TargetVariantVersionInfo.Update = Update;
-    TargetVariantVersionInfo.SDKVersion = SDKVersion;
-  }
-
-  std::vector<std::vector<std::string>> &getLinkerOptions() {
-    return LinkerOptions;
-  }
-
   /// @}
 
   /// \name Target Writer Proxy Accessors
diff --git a/llvm/include/llvm/MC/MCObjectWriter.h b/llvm/include/llvm/MC/MCObjectWriter.h
index 81ba6ffd5d44e..01114469c241e 100644
--- a/llvm/include/llvm/MC/MCObjectWriter.h
+++ b/llvm/include/llvm/MC/MCObjectWriter.h
@@ -32,20 +32,8 @@ class MCValue;
 /// should be emitted as part of writeObject().
 class MCObjectWriter {
 protected:
-  /// List of declared file names
-  SmallVector<std::pair<std::string, size_t>, 0> FileNames;
-  // XCOFF specific: Optional compiler version.
-  std::string CompilerVersion;
   std::vector<const MCSymbol *> AddrsigSyms;
   bool EmitAddrsigSection = false;
-  bool SubsectionsViaSymbols = false;
-
-  struct CGProfileEntry {
-    const MCSymbolRefExpr *From;
-    const MCSymbolRefExpr *To;
-    uint64_t Count;
-  };
-  SmallVector<CGProfileEntry, 0> CGProfile;
 
   MCObjectWriter() = default;
 
@@ -55,7 +43,7 @@ class MCObjectWriter {
   virtual ~MCObjectWriter();
 
   /// lifetime management
-  virtual void reset();
+  virtual void reset() {}
 
   /// \name High-Level API
   /// @{
@@ -93,13 +81,11 @@ class MCObjectWriter {
                                                       bool InSet,
                                                       bool IsPCRel) const;
 
-  MutableArrayRef<std::pair<std::string, size_t>> getFileNames() {
-    return FileNames;
-  }
-  void addFileName(MCAssembler &Asm, StringRef FileName);
-  void setCompilerVersion(StringRef CompilerVers) {
-    CompilerVersion = CompilerVers;
-  }
+  /// ELF only. Mark that we have seen GNU ABI usage (e.g. SHF_GNU_RETAIN).
+  virtual void markGnuAbi() {}
+
+  /// ELF only, override the default ABIVersion in the ELF header.
+  virtual void setOverrideABIVersion(uint8_t ABIVersion) {}
 
   /// Tell the object writer to emit an address-significance table during
   /// writeObject(). If this function is not called, all symbols are treated as
@@ -113,12 +99,15 @@ class MCObjectWriter {
   void addAddrsigSymbol(const MCSymbol *Sym) { AddrsigSyms.push_back(Sym); }
 
   std::vector<const MCSymbol *> &getAddrsigSyms() { return AddrsigSyms; }
-  SmallVector<CGProfileEntry, 0> &getCGProfile() { return CGProfile; }
-
-  // Mach-O specific: Whether .subsections_via_symbols is enabled.
-  bool getSubsectionsViaSymbols() const { return SubsectionsViaSymbols; }
-  void setSubsectionsViaSymbols(bool Value) { SubsectionsViaSymbols = Value; }
 
+  virtual void addExceptionEntry(const MCSymbol *Symbol, const MCSymbol *Trap,
+                                 unsigned LanguageCode, unsigned ReasonCode,
+                                 unsigned FunctionSize, bool hasDebug) {
+    report_fatal_error("addExceptionEntry is only supported on XCOFF targets");
+  }
+  virtual void addCInfoSymEntry(StringRef Name, StringRef Metadata) {
+    report_fatal_error("addCInfoSymEntry is only supported on XCOFF targets");
+  }
   /// Write the object file and returns the number of bytes written.
   ///
   /// This routine is called by the assembler after layout and relaxation is
diff --git a/llvm/include/llvm/MC/MCSPIRVObjectWriter.h b/llvm/include/llvm/MC/MCSPIRVObjectWriter.h
index 18bdb8be0145e..a8baf96b83842 100644
--- a/llvm/include/llvm/MC/MCSPIRVObjectWriter.h
+++ b/llvm/include/llvm/MC/MCSPIRVObjectWriter.h
@@ -10,8 +10,6 @@
 #define LLVM_MC_MCSPIRVOBJECTWRITER_H
 
 #include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 
@@ -28,32 +26,6 @@ class MCSPIRVObjectTargetWriter : public MCObjectTargetWriter {
   }
 };
 
-class SPIRVObjectWriter final : public MCObjectWriter {
-  support::endian::Writer W;
-  std::unique_ptr<MCSPIRVObjectTargetWriter> TargetObjectWriter;
-
-  struct VersionInfoType {
-    unsigned Major = 0;
-    unsigned Minor = 0;
-    unsigned Bound = 0;
-  } VersionInfo;
-
-public:
-  SPIRVObjectWriter(std::unique_ptr<MCSPIRVObjectTargetWriter> MOTW,
-                    raw_pwrite_stream &OS)
-      : W(OS, llvm::endianness::little), TargetObjectWriter(std::move(MOTW)) {}
-
-  void setBuildVersion(unsigned Major, unsigned Minor, unsigned Bound);
-
-private:
-  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
-                        const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue) override {}
-
-  uint64_t writeObject(MCAssembler &Asm) override;
-  void writeHeader(const MCAssembler &Asm);
-};
-
 /// Construct a new SPIR-V writer instance.
 ///
 /// \param MOTW - The target specific SPIR-V writer subclass.
diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h
index 1289d6f6f9f65..dcdcd094fa17b 100644
--- a/llvm/include/llvm/MC/MCSection.h
+++ b/llvm/include/llvm/MC/MCSection.h
@@ -99,6 +99,8 @@ class MCSection {
   /// Whether this section has had instructions emitted into it.
   bool HasInstructions : 1;
 
+  bool HasLayout : 1;
+
   bool IsRegistered : 1;
 
   bool IsText : 1;
@@ -167,6 +169,9 @@ class MCSection {
   bool hasInstructions() const { return HasInstructions; }
   void setHasInstructions(bool Value) { HasInstructions = Value; }
 
+  bool hasLayout() const { return HasLayout; }
+  void setHasLayout(bool Value) { HasLayout = Value; }
+
   bool isRegistered() const { return IsRegistered; }
   void setIsRegistered(bool Value) { IsRegistered = Value; }
 
diff --git a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
index 49498871d741d..307800e73c687 100644
--- a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
+++ b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h
@@ -43,37 +43,6 @@ class MCWinCOFFObjectTargetWriter : public MCObjectTargetWriter {
   virtual bool recordRelocation(const MCFixup &) const { return true; }
 };
 
-class WinCOFFWriter;
-
-class WinCOFFObjectWriter : public MCObjectWriter {
-  friend class WinCOFFWriter;
-
-  std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
-  std::unique_ptr<WinCOFFWriter> ObjWriter, DwoWriter;
-  bool IncrementalLinkerCompatible = false;
-
-public:
-  WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
-                      raw_pwrite_stream &OS);
-  WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
-                      raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS);
-
-  // MCObjectWriter interface implementation.
-  void reset() override;
-  void setIncrementalLinkerCompatible(bool Value) {
-    IncrementalLinkerCompatible = Value;
-  }
-  void executePostLayoutBinding(MCAssembler &Asm) override;
-  bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
-                                              const MCSymbol &SymA,
-                                              const MCFragment &FB, bool InSet,
-                                              bool IsPCRel) const override;
-  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
-                        const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue) override;
-  uint64_t writeObject(MCAssembler &Asm) override;
-};
-
 /// Construct a new Win COFF writer instance.
 ///
 /// \param MOTW - The target specific WinCOFF writer subclass.
diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
index 5c39d80538944..893a70f74ee3b 100644
--- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h
@@ -22,7 +22,6 @@ class MCSection;
 class MCSubtargetInfo;
 class MCSymbol;
 class StringRef;
-class WinCOFFObjectWriter;
 class raw_pwrite_stream;
 
 class MCWinCOFFStreamer : public MCObjectStreamer {
@@ -37,8 +36,6 @@ class MCWinCOFFStreamer : public MCObjectStreamer {
     MCObjectStreamer::reset();
   }
 
-  WinCOFFObjectWriter &getWriter();
-
   /// \name MCStreamer interface
   /// \{
 
diff --git a/llvm/include/llvm/MC/MCXCOFFObjectWriter.h b/llvm/include/llvm/MC/MCXCOFFObjectWriter.h
index c0e32a70172d8..faad2ceb26910 100644
--- a/llvm/include/llvm/MC/MCXCOFFObjectWriter.h
+++ b/llvm/include/llvm/MC/MCXCOFFObjectWriter.h
@@ -43,15 +43,6 @@ std::unique_ptr<MCObjectWriter>
 createXCOFFObjectWriter(std::unique_ptr<MCXCOFFObjectTargetWriter> MOTW,
                         raw_pwrite_stream &OS);
 
-namespace XCOFF {
-void addExceptionEntry(MCObjectWriter &Writer, const MCSymbol *Symbol,
-                       const MCSymbol *Trap, unsigned LanguageCode,
-                       unsigned ReasonCode, unsigned FunctionSize,
-                       bool hasDebug);
-void addCInfoSymEntry(MCObjectWriter &Writer, StringRef Name,
-                      StringRef Metadata);
-} // namespace XCOFF
-
 } // end namespace llvm
 
 #endif // LLVM_MC_MCXCOFFOBJECTWRITER_H
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 474a19531ff5d..551d297e0c089 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -390,9 +390,9 @@ class PassBuilder {
   /// returns false.
   Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
 
-  /// Parse RegAllocFilterName to get RegAllocFilterFunc.
-  std::optional<RegAllocFilterFunc>
-  parseRegAllocFilter(StringRef RegAllocFilterName);
+  /// Parse RegClassFilterName to get RegClassFilterFunc.
+  std::optional<RegClassFilterFunc>
+  parseRegAllocFilter(StringRef RegClassFilterName);
 
   /// Print pass names.
   void printPassNames(raw_ostream &OS);
@@ -586,7 +586,7 @@ class PassBuilder {
   /// needs it. E.g. AMDGPU requires regalloc passes can handle sgpr and vgpr
   /// separately.
   void registerRegClassFilterParsingCallback(
-      const std::function<RegAllocFilterFunc(StringRef)> &C) {
+      const std::function<RegClassFilterFunc(StringRef)> &C) {
     RegClassFilterParsingCallbacks.push_back(C);
   }
 
@@ -807,7 +807,7 @@ class PassBuilder {
               2>
       MachineFunctionPipelineParsingCallbacks;
   // Callbacks to parse `filter` parameter in register allocation passes
-  SmallVector<std::function<RegAllocFilterFunc(StringRef)>, 2>
+  SmallVector<std::function<RegClassFilterFunc(StringRef)>, 2>
       RegClassFilterParsingCallbacks;
 };
 
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 847e53cfa7432..e9866d94b762c 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -739,7 +739,6 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
 #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
 #define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
-#define INSTR_PROF_PROFILE_SAMPLING_VAR __llvm_profile_sampling
 
 /* The variable that holds the name of the profile data
  * specified via command line. */
diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index 0c67206d307ef..dfffe5c96f1cf 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -40,7 +40,7 @@
 //                                      |
 //                                      +- PHINode
 //                                      |
-//                                      +- ReturnInst
+//                                      +- RetInst
 //                                      |
 //                                      +- SelectInst
 //                                      |
@@ -75,9 +75,7 @@ class BasicBlock;
 class Context;
 class Function;
 class Instruction;
-class SelectInst;
 class LoadInst;
-class ReturnInst;
 class StoreInst;
 class User;
 class Value;
@@ -175,13 +173,11 @@ class Value {
   /// order.
   llvm::Value *Val = nullptr;
 
-  friend class Context;    // For getting `Val`.
-  friend class User;       // For getting `Val`.
-  friend class Use;        // For getting `Val`.
-  friend class SelectInst; // For getting `Val`.
-  friend class LoadInst;   // For getting `Val`.
-  friend class StoreInst;  // For getting `Val`.
-  friend class ReturnInst; // For getting `Val`.
+  friend class Context;   // For getting `Val`.
+  friend class User;      // For getting `Val`.
+  friend class Use;       // For getting `Val`.
+  friend class LoadInst;  // For getting `Val`.
+  friend class StoreInst; // For getting `Val`.
 
   /// All values point to the context.
   Context &Ctx;
@@ -413,8 +409,6 @@ class Constant : public sandboxir::User {
   }
 
 public:
-  static Constant *createInt(Type *Ty, uint64_t V, Context &Ctx,
-                             bool IsSigned = false);
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
     return From->getSubclassID() == ClassID::Constant ||
@@ -503,10 +497,8 @@ class Instruction : public sandboxir::User {
   /// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This
   /// returns its topmost LLVM IR instruction.
   llvm::Instruction *getTopmostLLVMInstruction() const;
-  friend class SelectInst; // For getTopmostLLVMInstruction().
-  friend class LoadInst;   // For getTopmostLLVMInstruction().
-  friend class StoreInst;  // For getTopmostLLVMInstruction().
-  friend class ReturnInst; // For getTopmostLLVMInstruction().
+  friend class LoadInst;  // For getTopmostLLVMInstruction().
+  friend class StoreInst; // For getTopmostLLVMInstruction().
 
   /// \Returns the LLVM IR Instructions that this SandboxIR maps to in program
   /// order.
@@ -571,52 +563,6 @@ class Instruction : public sandboxir::User {
 #endif
 };
 
-class SelectInst : public Instruction {
-  /// Use Context::createSelectInst(). Don't call the
-  /// constructor directly.
-  SelectInst(llvm::SelectInst *CI, Context &Ctx)
-      : Instruction(ClassID::Select, Opcode::Select, CI, Ctx) {}
-  friend Context; // for SelectInst()
-  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
-    return getOperandUseDefault(OpIdx, Verify);
-  }
-  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
-    return {cast<llvm::Instruction>(Val)};
-  }
-  static Value *createCommon(Value *Cond, Value *True, Value *False,
-                             const Twine &Name, IRBuilder<> &Builder,
-                             Context &Ctx);
-
-public:
-  unsigned getUseOperandNo(const Use &Use) const final {
-    return getUseOperandNoDefault(Use);
-  }
-  unsigned getNumOfIRInstrs() const final { return 1u; }
-  static Value *create(Value *Cond, Value *True, Value *False,
-                       Instruction *InsertBefore, Context &Ctx,
-                       const Twine &Name = "");
-  static Value *create(Value *Cond, Value *True, Value *False,
-                       BasicBlock *InsertAtEnd, Context &Ctx,
-                       const Twine &Name = "");
-  Value *getCondition() { return getOperand(0); }
-  Value *getTrueValue() { return getOperand(1); }
-  Value *getFalseValue() { return getOperand(2); }
-
-  void setCondition(Value *New) { setOperand(0, New); }
-  void setTrueValue(Value *New) { setOperand(1, New); }
-  void setFalseValue(Value *New) { setOperand(2, New); }
-  void swapValues() { cast<llvm::SelectInst>(Val)->swapValues(); }
-  /// For isa/dyn_cast.
-  static bool classof(const Value *From);
-#ifndef NDEBUG
-  void verify() const final {
-    assert(isa<llvm::SelectInst>(Val) && "Expected SelectInst!");
-  }
-  void dump(raw_ostream &OS) const override;
-  LLVM_DUMP_METHOD void dump() const override;
-#endif
-};
-
 class LoadInst final : public Instruction {
   /// Use LoadInst::create() instead of calling the constructor.
   LoadInst(llvm::LoadInst *LI, Context &Ctx)
@@ -693,43 +639,6 @@ class StoreInst final : public Instruction {
 #endif
 };
 
-class ReturnInst final : public Instruction {
-  /// Use ReturnInst::create() instead of calling the constructor.
-  ReturnInst(llvm::Instruction *I, Context &Ctx)
-      : Instruction(ClassID::Ret, Opcode::Ret, I, Ctx) {}
-  ReturnInst(ClassID SubclassID, llvm::Instruction *I, Context &Ctx)
-      : Instruction(SubclassID, Opcode::Ret, I, Ctx) {}
-  friend class Context; // For accessing the constructor in create*()
-  Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
-    return getOperandUseDefault(OpIdx, Verify);
-  }
-  SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
-    return {cast<llvm::Instruction>(Val)};
-  }
-  static ReturnInst *createCommon(Value *RetVal, IRBuilder<> &Builder,
-                                  Context &Ctx);
-
-public:
-  static ReturnInst *create(Value *RetVal, Instruction *InsertBefore,
-                            Context &Ctx);
-  static ReturnInst *create(Value *RetVal, BasicBlock *InsertAtEnd,
-                            Context &Ctx);
-  static bool classof(const Value *From) {
-    return From->getSubclassID() == ClassID::Ret;
-  }
-  unsigned getUseOperandNo(const Use &Use) const final {
-    return getUseOperandNoDefault(Use);
-  }
-  unsigned getNumOfIRInstrs() const final { return 1u; }
-  /// \Returns null if there is no return value.
-  Value *getReturnValue() const;
-#ifndef NDEBUG
-  void verify() const final {}
-  void dump(raw_ostream &OS) const override;
-  LLVM_DUMP_METHOD void dump() const override;
-#endif
-};
-
 /// An LLVM Instruction that has no SandboxIR equivalent class gets mapped to
 /// an OpaqueInst.
 class OpaqueInst : public sandboxir::Instruction {
@@ -854,11 +763,6 @@ class Context {
   Value *getOrCreateValue(llvm::Value *LLVMV) {
     return getOrCreateValueInternal(LLVMV, 0);
   }
-  /// Get or create a sandboxir::Constant from an existing LLVM IR \p LLVMC.
-  Constant *getOrCreateConstant(llvm::Constant *LLVMC) {
-    return cast<Constant>(getOrCreateValueInternal(LLVMC, 0));
-  }
-  friend class Constant; // For getOrCreateConstant().
   /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will
   /// also create all contents of the block.
   BasicBlock *createBasicBlock(llvm::BasicBlock *BB);
@@ -868,14 +772,10 @@ class Context {
   IRBuilder<ConstantFolder> LLVMIRBuilder;
   auto &getLLVMIRBuilder() { return LLVMIRBuilder; }
 
-  SelectInst *createSelectInst(llvm::SelectInst *SI);
-  friend SelectInst; // For createSelectInst()
   LoadInst *createLoadInst(llvm::LoadInst *LI);
   friend LoadInst; // For createLoadInst()
   StoreInst *createStoreInst(llvm::StoreInst *SI);
   friend StoreInst; // For createStoreInst()
-  ReturnInst *createReturnInst(llvm::ReturnInst *I);
-  friend ReturnInst; // For createReturnInst()
 
 public:
   Context(LLVMContext &LLVMCtx)
diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
index efa9155755587..90365ca7a1c45 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def
+++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
@@ -25,10 +25,8 @@ DEF_USER(Constant, Constant)
 #endif
 //       ClassID, Opcode(s),  Class
 DEF_INSTR(Opaque, OP(Opaque), OpaqueInst)
-DEF_INSTR(Select, OP(Select), SelectInst)
 DEF_INSTR(Load, OP(Load), LoadInst)
 DEF_INSTR(Store, OP(Store), StoreInst)
-DEF_INSTR(Ret, OP(Ret), ReturnInst)
 
 #ifdef DEF_VALUE
 #undef DEF_VALUE
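
A side note on the .def mechanism touched above: these files are X-macro
tables, where each DEF_INSTR row expands to whatever the including site
defines. A self-contained sketch of the pattern; the row list and enum below
are illustrative stand-ins, not the actual LLVM expansion:

#include <cstdio>

// Hypothetical stand-in for the DEF_INSTR rows in SandboxIRValues.def.
#define SANDBOX_INSTRS \
  DEF_INSTR(Opaque, OP(Opaque), OpaqueInst) \
  DEF_INSTR(Load, OP(Load), LoadInst) \
  DEF_INSTR(Store, OP(Store), StoreInst)

enum class ClassID {
#define DEF_INSTR(ID, OPC, CLASS) ID, // each row contributes one enumerator
  SANDBOX_INSTRS
#undef DEF_INSTR
};

int main() { std::printf("%d\n", static_cast<int>(ClassID::Store)); } // prints 2
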
diff --git a/llvm/include/llvm/Support/TimeProfiler.h b/llvm/include/llvm/Support/TimeProfiler.h
index 9e2ba31991f54..31f7df10916db 100644
--- a/llvm/include/llvm/Support/TimeProfiler.h
+++ b/llvm/include/llvm/Support/TimeProfiler.h
@@ -83,28 +83,16 @@ namespace llvm {
 
 class raw_pwrite_stream;
 
-struct TimeTraceMetadata {
-  std::string Detail;
-  // Source file and line number information for the event.
-  std::string File;
-  int Line = 0;
-
-  bool isEmpty() const { return Detail.empty() && File.empty(); }
-};
-
 struct TimeTraceProfiler;
 TimeTraceProfiler *getTimeTraceProfilerInstance();
 
-bool isTimeTraceVerbose();
-
 struct TimeTraceProfilerEntry;
 
 /// Initialize the time trace profiler.
 /// This sets up the global \p TimeTraceProfilerInstance
 /// variable to be the profiler instance.
 void timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
-                                 StringRef ProcName,
-                                 bool TimeTraceVerbose = false);
+                                 StringRef ProcName);
 
 /// Cleanup the time trace profiler, if it was initialized.
 void timeTraceProfilerCleanup();
@@ -140,10 +128,6 @@ TimeTraceProfilerEntry *
 timeTraceProfilerBegin(StringRef Name,
                        llvm::function_ref<std::string()> Detail);
 
-TimeTraceProfilerEntry *
-timeTraceProfilerBegin(StringRef Name,
-                       llvm::function_ref<TimeTraceMetadata()> MetaData);
-
 /// Manually begin a time section, with the given \p Name and \p Detail.
 /// This starts Async Events having \p Name as a category which is shown
 /// separately from other traces. See
@@ -180,11 +164,6 @@ class TimeTraceScope {
     if (getTimeTraceProfilerInstance() != nullptr)
       Entry = timeTraceProfilerBegin(Name, Detail);
   }
-  TimeTraceScope(StringRef Name,
-                 llvm::function_ref<TimeTraceMetadata()> Metadata) {
-    if (getTimeTraceProfilerInstance() != nullptr)
-      Entry = timeTraceProfilerBegin(Name, Metadata);
-  }
   ~TimeTraceScope() {
     if (getTimeTraceProfilerInstance() != nullptr)
       timeTraceProfilerEnd(Entry);
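
For reference, the detail-callback overload kept by this change is used as in
the minimal sketch below; it assumes the profiler was initialized elsewhere
via timeTraceProfilerInitialize():

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/TimeProfiler.h"

void compileOneFile(llvm::StringRef Path) {
  // The callback is only evaluated when a profiler instance exists, so
  // building the detail string costs nothing when tracing is off.
  llvm::TimeTraceScope Scope("compileOneFile", [&] { return Path.str(); });
  // ... the work being timed ...
}
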
diff --git a/llvm/include/llvm/Support/raw_socket_stream.h b/llvm/include/llvm/Support/raw_socket_stream.h
index 6c65a66dec9a4..eed865fb5af49 100644
--- a/llvm/include/llvm/Support/raw_socket_stream.h
+++ b/llvm/include/llvm/Support/raw_socket_stream.h
@@ -92,14 +92,13 @@ class ListeningSocket {
   /// Accepts an incoming connection on the listening socket. This method can
   /// optionally either block until a connection is available or timeout after a
   /// specified amount of time has passed. By default the method will block
-  /// until the socket has recieved a connection. If the accept timesout this
-  /// method will return std::errc:timed_out
+  /// until the socket has received a connection.
   ///
   /// \param Timeout An optional timeout duration in milliseconds. Setting
-  /// Timeout to a negative number causes ::accept to block indefinitely
+  /// Timeout to -1 causes accept to block indefinitely.
   ///
-  Expected<std::unique_ptr<raw_socket_stream>> accept(
-      const std::chrono::milliseconds &Timeout = std::chrono::milliseconds(-1));
+  Expected<std::unique_ptr<raw_socket_stream>>
+  accept(std::chrono::milliseconds Timeout = std::chrono::milliseconds(-1));
 
   /// Creates a listening socket bound to the specified file system path.
   /// Handles the socket creation, binding, and immediately starts listening for
@@ -125,28 +124,11 @@ class raw_socket_stream : public raw_fd_stream {
 
 public:
   raw_socket_stream(int SocketFD);
-  ~raw_socket_stream();
-
   /// Create a \p raw_socket_stream connected to the UNIX domain socket at \p
   /// SocketPath.
   static Expected<std::unique_ptr<raw_socket_stream>>
   createConnectedUnix(StringRef SocketPath);
-
-  /// Attempt to read from the raw_socket_stream's file descriptor.
-  ///
-  /// This method can optionally either block until data is read or an error has
-  /// occurred or timeout after a specified amount of time has passed. By
-  /// default the method will block until the socket has read data or
-  /// encountered an error. If the read times out this method will return
-  /// std::errc:timed_out
-  ///
-  /// \param Ptr The start of the buffer that will hold any read data
-  /// \param Size The number of bytes to be read
-  /// \param Timeout An optional timeout duration in milliseconds
-  ///
-  ssize_t read(
-      char *Ptr, size_t Size,
-      const std::chrono::milliseconds &Timeout = std::chrono::milliseconds(-1));
+  ~raw_socket_stream();
 };
 
 } // end namespace llvm
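
A minimal usage sketch of the simplified API: accept() now takes its Timeout
by value, and the -1 default blocks indefinitely. The ListeningSocket factory
name (createUnix) is assumed from the surrounding header rather than shown in
this hunk:

#include "llvm/Support/raw_socket_stream.h"
#include <chrono>

llvm::Error serveOnce(llvm::StringRef SocketPath) {
  llvm::Expected<llvm::ListeningSocket> Listener =
      llvm::ListeningSocket::createUnix(SocketPath);
  if (!Listener)
    return Listener.takeError();
  // Wait at most five seconds for a client instead of blocking forever.
  llvm::Expected<std::unique_ptr<llvm::raw_socket_stream>> Conn =
      Listener->accept(std::chrono::milliseconds(5000));
  if (!Conn)
    return Conn.takeError();
  **Conn << "hello\n";
  return llvm::Error::success();
}
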
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
index 101a48cbd5399..b20d124953f88 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
@@ -10,7 +10,6 @@ LOONGARCH_FEATURE("+lasx", FK_LASX)
 LOONGARCH_FEATURE("+lbt", FK_LBT)
 LOONGARCH_FEATURE("+lvz", FK_LVZ)
 LOONGARCH_FEATURE("+ual", FK_UAL)
-LOONGARCH_FEATURE("+frecipe", FK_FRECIPE)
 
 #undef LOONGARCH_FEATURE
 
@@ -20,6 +19,5 @@ LOONGARCH_FEATURE("+frecipe", FK_FRECIPE)
 
 LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL)
 LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL)
-LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE)
 
 #undef LOONGARCH_ARCH
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
index c0bb15a5163b1..028844187584b 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
@@ -46,9 +46,6 @@ enum FeatureKind : uint32_t {
 
   // Allow memory accesses to be unaligned.
   FK_UAL = 1 << 8,
-
-  // Floating-point approximate reciprocal instructions are available.
-  FK_FRECIPE = 1 << 9,
 };
 
 struct FeatureInfo {
diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index c75778952e0f5..7421dac2744b6 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -24,14 +24,6 @@ class Triple;
 
 namespace RISCV {
 
-namespace RISCVExtensionBitmaskTable {
-struct RISCVExtensionBitmask {
-  const char *Name;
-  unsigned GroupID;
-  unsigned BitPosition;
-};
-} // namespace RISCVExtensionBitmaskTable
-
 // We use 64 bits as the known part in the scalable vector types.
 static constexpr unsigned RVVBitsPerBlock = 64;
 
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index ebd92f264d904..b3bb354b38ff5 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -268,7 +268,7 @@ class Triple {
     Cygnus,
     CoreCLR,
     Simulator, // Simulator variants of other systems, e.g., Apple's iOS
-    MacABI,    // Mac Catalyst variant of Apple's iOS deployment target.
+    MacABI, // Mac Catalyst variant of Apple's iOS deployment target.
 
     // Shader Stages
     // The order of these values matters, and must be kept in sync with the
@@ -292,9 +292,7 @@ class Triple {
     OpenCL,
     OpenHOS,
 
-    PAuthTest,
-
-    LastEnvironmentType = PAuthTest
+    LastEnvironmentType = OpenHOS
   };
   enum ObjectFormatType {
     UnknownObjectFormat,
diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h
index 969c2cd12f3f0..ea97ab2562a5b 100644
--- a/llvm/include/llvm/Transforms/Instrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation.h
@@ -121,18 +121,12 @@ struct InstrProfOptions {
   // Use BFI to guide register promotion
   bool UseBFIInPromotion = false;
 
-  // Use sampling to reduce the profile instrumentation runtime overhead.
-  bool Sampling = false;
-
   // Name of the profile file to use as output
   std::string InstrProfileOutput;
 
   InstrProfOptions() = default;
 };
 
-// Create the variable for profile sampling.
-void createProfileSamplingVar(Module &M);
-
 // Options for sanitizer coverage instrumentation.
 struct SanitizerCoverageOptions {
   enum Type {
diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
index 7199f27dbc991..5b1977b7de9a2 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
@@ -43,14 +43,12 @@ class FileSystem;
 class PGOInstrumentationGenCreateVar
     : public PassInfoMixin<PGOInstrumentationGenCreateVar> {
 public:
-  PGOInstrumentationGenCreateVar(std::string CSInstrName = "",
-                                 bool Sampling = false)
-      : CSInstrName(CSInstrName), ProfileSampling(Sampling) {}
+  PGOInstrumentationGenCreateVar(std::string CSInstrName = "")
+      : CSInstrName(CSInstrName) {}
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
 
 private:
   std::string CSInstrName;
-  bool ProfileSampling;
 };
 
 /// The instrumentation (profile-instr-gen) pass for IR based PGO.
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
index 2ea9d64f03cb6..8008fc6e8422d 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
@@ -30,7 +30,6 @@ struct SimplifyCFGOptions {
   bool SinkCommonInsts = false;
   bool SimplifyCondBranch = true;
   bool SpeculateBlocks = true;
-  bool SpeculateUnpredictables = false;
 
   AssumptionCache *AC = nullptr;
 
@@ -76,10 +75,6 @@ struct SimplifyCFGOptions {
     SpeculateBlocks = B;
     return *this;
   }
-  SimplifyCFGOptions &speculateUnpredictables(bool B) {
-    SpeculateUnpredictables = B;
-    return *this;
-  }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 61c6aa5e5a3eb..255a7ae56dd3b 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -313,13 +313,6 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
     const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
     const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
     if (StartS->getNumOperands() == 2 && Offset && NewBase) {
-      // The following code below assumes the offset is unsigned, but GEP
-      // offsets are treated as signed so we can end up with a signed value
-      // here too. For example, suppose the initial PHI value is (i8 255),
-      // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
-      if (Offset->getAPInt().isNegative())
-        return false;
-
       // For the moment, restrict ourselves to the case where the offset is a
       // multiple of the requested alignment and the base is aligned.
       // TODO: generalize if a case found which warrants
@@ -750,8 +743,9 @@ static bool isPointerAlwaysReplaceable(const Value *From, const Value *To,
   if (isa<Constant>(To) &&
       isDereferenceablePointer(To, Type::getInt8Ty(To->getContext()), DL))
     return true;
-  return getUnderlyingObjectAggressive(From) ==
-         getUnderlyingObjectAggressive(To);
+  if (getUnderlyingObject(From) == getUnderlyingObject(To))
+    return true;
+  return false;
 }
 
 bool llvm::canReplacePointersInUseIfEqual(const Use &U, const Value *To,
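
The deleted comment turns on GEP offsets being signed; a stand-alone
illustration of the sign extension it warned about:

#include <cassert>
#include <cstdint>

int main() {
  int8_t Off8 = static_cast<int8_t>(255); // the (i8 255) initial PHI value
  int64_t Off64 = Off8;                   // sign-extends to (i64 -1), not 255
  assert(Off64 == -1);
}
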
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 6a0fa98089ba5..693f7a5bb7af5 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -279,10 +279,6 @@ BranchProbability TargetTransformInfo::getPredictableBranchThreshold() const {
              : TTIImpl->getPredictableBranchThreshold();
 }
 
-InstructionCost TargetTransformInfo::getBranchMispredictPenalty() const {
-  return TTIImpl->getBranchMispredictPenalty();
-}
-
 bool TargetTransformInfo::hasBranchDivergence(const Function *F) const {
   return TTIImpl->hasBranchDivergence(F);
 }
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 40fe1ffe13f1b..03eb6ef42b0ff 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -934,7 +934,7 @@ getKnownBitsFromAndXorOr(const Operator *I, const APInt &DemandedElts,
     // Demanded) == (xor(x, x-1) & Demanded). Extend the xor pattern
     // to use arbitrary C if xor(x, x-C) as the same as xor(x, x-1).
     if (HasKnownOne &&
-        match(I, m_c_Xor(m_Value(X), m_Add(m_Deferred(X), m_AllOnes())))) {
+        match(I, m_c_Xor(m_Value(X), m_c_Add(m_Deferred(X), m_AllOnes())))) {
       const KnownBits &XBits = I->getOperand(0) == X ? KnownLHS : KnownRHS;
       KnownOut = XBits.blsmsk();
     }
@@ -6620,48 +6620,6 @@ void llvm::getUnderlyingObjects(const Value *V,
   } while (!Worklist.empty());
 }
 
-const Value *llvm::getUnderlyingObjectAggressive(const Value *V) {
-  const unsigned MaxVisited = 8;
-
-  SmallPtrSet<const Value *, 8> Visited;
-  SmallVector<const Value *, 8> Worklist;
-  Worklist.push_back(V);
-  const Value *Object = nullptr;
-  // Used as fallback if we can't find a common underlying object through
-  // recursion.
-  bool First = true;
-  const Value *FirstObject = getUnderlyingObject(V);
-  do {
-    const Value *P = Worklist.pop_back_val();
-    P = First ? FirstObject : getUnderlyingObject(P);
-    First = false;
-
-    if (!Visited.insert(P).second)
-      continue;
-
-    if (Visited.size() == MaxVisited)
-      return FirstObject;
-
-    if (auto *SI = dyn_cast<SelectInst>(P)) {
-      Worklist.push_back(SI->getTrueValue());
-      Worklist.push_back(SI->getFalseValue());
-      continue;
-    }
-
-    if (auto *PN = dyn_cast<PHINode>(P)) {
-      append_range(Worklist, PN->incoming_values());
-      continue;
-    }
-
-    if (!Object)
-      Object = P;
-    else if (Object != P)
-      return FirstObject;
-  } while (!Worklist.empty());
-
-  return Object;
-}
-
 /// This is the function that does the work of looking through basic
 /// ptrtoint+arithmetic+inttoptr sequences.
 static const Value *getUnderlyingObjectFromInt(const Value *V) {
@@ -6813,16 +6771,15 @@ bool llvm::isSafeToSpeculativelyExecute(const Instruction *Inst,
                                         const Instruction *CtxI,
                                         AssumptionCache *AC,
                                         const DominatorTree *DT,
-                                        const TargetLibraryInfo *TLI,
-                                        bool UseVariableInfo) {
+                                        const TargetLibraryInfo *TLI) {
   return isSafeToSpeculativelyExecuteWithOpcode(Inst->getOpcode(), Inst, CtxI,
-                                                AC, DT, TLI, UseVariableInfo);
+                                                AC, DT, TLI);
 }
 
 bool llvm::isSafeToSpeculativelyExecuteWithOpcode(
     unsigned Opcode, const Instruction *Inst, const Instruction *CtxI,
-    AssumptionCache *AC, const DominatorTree *DT, const TargetLibraryInfo *TLI,
-    bool UseVariableInfo) {
+    AssumptionCache *AC, const DominatorTree *DT,
+    const TargetLibraryInfo *TLI) {
 #ifndef NDEBUG
   if (Inst->getOpcode() != Opcode) {
     // Check that the operands are actually compatible with the Opcode override.
@@ -6874,9 +6831,6 @@ bool llvm::isSafeToSpeculativelyExecuteWithOpcode(
     return false;
   }
   case Instruction::Load: {
-    if (!UseVariableInfo)
-      return false;
-
     const LoadInst *LI = dyn_cast<LoadInst>(Inst);
     if (!LI)
       return false;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 2297b27ffdc07..91b5703944f3d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3144,7 +3144,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
     return MCSymbolRefExpr::create(getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
-    return lowerBlockAddressConstant(*BA);
+    return MCSymbolRefExpr::create(GetBlockAddressSymbol(BA), Ctx);
 
   if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(CV))
     return getObjFileLowering().lowerDSOLocalEquivalent(Equiv, TM);
@@ -3826,10 +3826,6 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const {
   return const_cast<AsmPrinter *>(this)->getAddrLabelSymbol(BB);
 }
 
-const MCExpr *AsmPrinter::lowerBlockAddressConstant(const BlockAddress &BA) {
-  return MCSymbolRefExpr::create(GetBlockAddressSymbol(&BA), OutContext);
-}
-
 /// GetCPISymbol - Return the symbol for the specified constant pool entry.
 MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
   if (getSubtargetInfo().getTargetTriple().isWindowsMSVCEnvironment()) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index f88653146cc6f..80cd5ec501f25 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2990,9 +2990,6 @@ struct ArangeSpan {
 // Emit a debug aranges section, containing a CU lookup for any
 // address we can tie back to a CU.
 void DwarfDebug::emitDebugARanges() {
-  if (ArangeLabels.empty())
-    return;
-
   // Provides a unique id per text section.
   MapVector<MCSection *, SmallVector<SymbolCU, 8>> SectionMap;
 
@@ -3015,7 +3012,8 @@ void DwarfDebug::emitDebugARanges() {
   for (auto &I : SectionMap) {
     MCSection *Section = I.first;
     SmallVector<SymbolCU, 8> &List = I.second;
-    assert(!List.empty());
+    if (List.size() < 1)
+      continue;
 
     // If we have no section (e.g. common), just write out
     // individual spans for each symbol.
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 92a03eb52e35d..c0fc7a2b35ea3 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -80,6 +80,7 @@ TailMergeThreshold("tail-merge-threshold",
           cl::init(150), cl::Hidden);
 
 // Heuristic for tail merging (and, inversely, tail duplication).
+// TODO: This should be replaced with a target query.
 static cl::opt<unsigned>
 TailMergeSize("tail-merge-size",
               cl::desc("Min number of instructions to consider tail merging"),
@@ -144,6 +145,8 @@ BranchFolder::BranchFolder(bool DefaultEnableTailMerge, bool CommonHoist,
                            ProfileSummaryInfo *PSI, unsigned MinTailLength)
     : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength),
       MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) {
+  if (MinCommonTailLength == 0)
+    MinCommonTailLength = TailMergeSize;
   switch (FlagEnableTailMerge) {
   case cl::BOU_UNSET:
     EnableTailMerge = DefaultEnableTailMerge;
@@ -192,12 +195,6 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
   MLI = mli;
   this->MRI = &MRI;
 
-  if (MinCommonTailLength == 0) {
-    MinCommonTailLength = TailMergeSize.getNumOccurrences() > 0
-                              ? TailMergeSize
-                              : TII->getTailMergeSize(MF);
-  }
-
   UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
   if (!UpdateLiveIns)
     MRI.invalidateLiveness();
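
The deleted block distinguished "flag left at its default" from "flag set on
the command line" via cl::opt::getNumOccurrences(); the constructor version
restored above always reads the option value. A generic sketch of the idiom,
with a hypothetical option name:

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<unsigned> Threshold("my-threshold", llvm::cl::init(3));

unsigned effectiveThreshold(unsigned TargetDefault) {
  // Prefer an explicit command-line value; otherwise defer to the target.
  return Threshold.getNumOccurrences() > 0 ? Threshold : TargetDefault;
}
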
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index e77ea3e76ad71..dfc3d73e322b8 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5179,9 +5179,13 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
   LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
   LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
 
+  unsigned KnownLeadingZeros =
+      KB ? KB->getKnownBits(LHS).countMinLeadingZeros() : 0;
   auto &MIB = Builder;
 
   bool UseSRL = false;
+  bool UseNPQ = false;
+  SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
   SmallVector<Register, 16> Shifts, Factors;
   auto *RHSDefInstr = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
   bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value();
@@ -5209,33 +5213,6 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
     return true;
   };
 
-  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
-    // Collect all magic values from the build vector.
-    if (!matchUnaryPredicate(MRI, RHS, BuildExactUDIVPattern))
-      llvm_unreachable("Expected unary predicate match to succeed");
-
-    Register Shift, Factor;
-    if (Ty.isVector()) {
-      Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
-      Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
-    } else {
-      Shift = Shifts[0];
-      Factor = Factors[0];
-    }
-
-    Register Res = LHS;
-
-    if (UseSRL)
-      Res = MIB.buildLShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
-
-    return MIB.buildMul(Ty, Res, Factor);
-  }
-
-  unsigned KnownLeadingZeros =
-      KB ? KB->getKnownBits(LHS).countMinLeadingZeros() : 0;
-
-  bool UseNPQ = false;
-  SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
   auto BuildUDIVPattern = [&](const Constant *C) {
     auto *CI = cast<ConstantInt>(C);
     const APInt &Divisor = CI->getValue();
@@ -5281,6 +5258,29 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
     return true;
   };
 
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+    // Collect all magic values from the build vector.
+    bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactUDIVPattern);
+    (void)Matched;
+    assert(Matched && "Expected unary predicate match to succeed");
+
+    Register Shift, Factor;
+    if (Ty.isVector()) {
+      Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+      Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+    } else {
+      Shift = Shifts[0];
+      Factor = Factors[0];
+    }
+
+    Register Res = LHS;
+
+    if (UseSRL)
+      Res = MIB.buildLShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+    return MIB.buildMul(Ty, Res, Factor);
+  }
+
   // Collect the shifts/magic values from each element.
   bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
   (void)Matched;
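
The IsExact path moved here relies on a classic identity: an exact unsigned
division by an odd constant d is a multiplication by d's inverse modulo 2^N
(for even d, the UseSRL shift strips the trailing zeros first). A
self-contained check of the arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Inv3 = 0xAAAAAAABu; // 3 * Inv3 == 1 (mod 2^32)
  assert(uint32_t(3u * Inv3) == 1u);
  for (uint32_t X : {0u, 3u, 300u, 0xFFFFFFFCu}) { // exact multiples of 3
    assert(X % 3u == 0u);
    assert(uint32_t(X * Inv3) == X / 3u); // exact udiv becomes a single mul
  }
}
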
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 644dbae8f93a5..640a425ffa735 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4006,9 +4006,6 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
   case G_UMIN:
   case G_UMAX:
     return lowerMinMax(MI);
-  case G_SCMP:
-  case G_UCMP:
-    return lowerThreewayCompare(MI);
   case G_FCOPYSIGN:
     return lowerFCopySign(MI);
   case G_FMINNUM:
@@ -7272,36 +7269,6 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
   return Legalized;
 }
 
-LegalizerHelper::LegalizeResult
-LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
-  GSUCmp *Cmp = cast<GSUCmp>(&MI);
-
-  Register Dst = Cmp->getReg(0);
-  LLT DstTy = MRI.getType(Dst);
-  LLT CmpTy = DstTy.changeElementSize(1);
-
-  CmpInst::Predicate LTPredicate = Cmp->isSigned()
-                                       ? CmpInst::Predicate::ICMP_SLT
-                                       : CmpInst::Predicate::ICMP_ULT;
-  CmpInst::Predicate GTPredicate = Cmp->isSigned()
-                                       ? CmpInst::Predicate::ICMP_SGT
-                                       : CmpInst::Predicate::ICMP_UGT;
-
-  auto One = MIRBuilder.buildConstant(DstTy, 1);
-  auto Zero = MIRBuilder.buildConstant(DstTy, 0);
-  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, Cmp->getLHSReg(),
-                                   Cmp->getRHSReg());
-  auto SelectZeroOrOne = MIRBuilder.buildSelect(DstTy, IsGT, One, Zero);
-
-  auto MinusOne = MIRBuilder.buildConstant(DstTy, -1);
-  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, Cmp->getLHSReg(),
-                                   Cmp->getRHSReg());
-  MIRBuilder.buildSelect(Dst, IsLT, MinusOne, SelectZeroOrOne);
-
-  MI.eraseFromParent();
-  return Legalized;
-}
-
 LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
   auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
diff --git a/llvm/lib/CodeGen/RegAllocBase.h b/llvm/lib/CodeGen/RegAllocBase.h
index a1ede08a15356..643094671d682 100644
--- a/llvm/lib/CodeGen/RegAllocBase.h
+++ b/llvm/lib/CodeGen/RegAllocBase.h
@@ -72,7 +72,7 @@ class RegAllocBase {
 
 private:
   /// Private, callees should go through shouldAllocateRegister
-  const RegAllocFilterFunc shouldAllocateRegisterImpl;
+  const RegClassFilterFunc ShouldAllocateClass;
 
 protected:
   /// Inst which is a def of an original reg and whose defs are already all
@@ -81,8 +81,7 @@ class RegAllocBase {
   /// always available for the remat of all the siblings of the original reg.
   SmallPtrSet<MachineInstr *, 32> DeadRemats;
 
-  RegAllocBase(const RegAllocFilterFunc F = nullptr)
-      : shouldAllocateRegisterImpl(F) {}
+  RegAllocBase(const RegClassFilterFunc F = nullptr) : ShouldAllocateClass(F) {}
 
   virtual ~RegAllocBase() = default;
 
@@ -91,9 +90,9 @@ class RegAllocBase {
 
   /// Get whether a given register should be allocated
   bool shouldAllocateRegister(Register Reg) {
-    if (!shouldAllocateRegisterImpl)
+    if (!ShouldAllocateClass)
       return true;
-    return shouldAllocateRegisterImpl(*TRI, *MRI, Reg);
+    return ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg));
   }
 
   // The top-level driver. The output is a VirtRegMap that is updated with
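
A hedged sketch of a filter matching the restored call site,
ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)); the RegClassFilterFunc
typedef itself is not shown in this hunk, so the signature below is inferred
from that call:

#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

static bool onlyAllocateGPRs(const llvm::TargetRegisterInfo &TRI,
                             const llvm::TargetRegisterClass &RC) {
  // Illustrative policy: allocate only classes named "GPR*" on this pass,
  // leaving everything else for a later allocator run.
  return llvm::StringRef(TRI.getRegClassName(&RC)).starts_with("GPR");
}

// Used as, e.g., createBasicRegisterAllocator(onlyAllocateGPRs).
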
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index caf9c32a5a349..e78a447969fb2 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -74,7 +74,7 @@ class RABasic : public MachineFunctionPass,
   void LRE_WillShrinkVirtReg(Register) override;
 
 public:
-  RABasic(const RegAllocFilterFunc F = nullptr);
+  RABasic(const RegClassFilterFunc F = nullptr);
 
   /// Return the pass name.
   StringRef getPassName() const override { return "Basic Register Allocator"; }
@@ -168,8 +168,10 @@ void RABasic::LRE_WillShrinkVirtReg(Register VirtReg) {
   enqueue(&LI);
 }
 
-RABasic::RABasic(RegAllocFilterFunc F)
-    : MachineFunctionPass(ID), RegAllocBase(F) {}
+RABasic::RABasic(RegClassFilterFunc F):
+  MachineFunctionPass(ID),
+  RegAllocBase(F) {
+}
 
 void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
@@ -331,6 +333,6 @@ FunctionPass* llvm::createBasicRegisterAllocator() {
   return new RABasic();
 }
 
-FunctionPass *llvm::createBasicRegisterAllocator(RegAllocFilterFunc F) {
+FunctionPass* llvm::createBasicRegisterAllocator(RegClassFilterFunc F) {
   return new RABasic(F);
 }
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 6e5ce72240d27..b4660d4085ffd 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -177,9 +177,9 @@ class InstrPosIndexes {
 
 class RegAllocFastImpl {
 public:
-  RegAllocFastImpl(const RegAllocFilterFunc F = nullptr,
+  RegAllocFastImpl(const RegClassFilterFunc F = nullptr,
                    bool ClearVirtRegs_ = true)
-      : ShouldAllocateRegisterImpl(F), StackSlotForVirtReg(-1),
+      : ShouldAllocateClass(F), StackSlotForVirtReg(-1),
         ClearVirtRegs(ClearVirtRegs_) {}
 
 private:
@@ -188,7 +188,7 @@ class RegAllocFastImpl {
   const TargetRegisterInfo *TRI = nullptr;
   const TargetInstrInfo *TII = nullptr;
   RegisterClassInfo RegClassInfo;
-  const RegAllocFilterFunc ShouldAllocateRegisterImpl;
+  const RegClassFilterFunc ShouldAllocateClass;
 
   /// Basic block currently being allocated.
   MachineBasicBlock *MBB = nullptr;
@@ -397,7 +397,7 @@ class RegAllocFast : public MachineFunctionPass {
 public:
   static char ID;
 
-  RegAllocFast(const RegAllocFilterFunc F = nullptr, bool ClearVirtRegs_ = true)
+  RegAllocFast(const RegClassFilterFunc F = nullptr, bool ClearVirtRegs_ = true)
       : MachineFunctionPass(ID), Impl(F, ClearVirtRegs_) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override {
@@ -440,10 +440,10 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
 
 bool RegAllocFastImpl::shouldAllocateRegister(const Register Reg) const {
   assert(Reg.isVirtual());
-  if (!ShouldAllocateRegisterImpl)
+  if (!ShouldAllocateClass)
     return true;
-
-  return ShouldAllocateRegisterImpl(*TRI, *MRI, Reg);
+  const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
+  return ShouldAllocateClass(*TRI, RC);
 }
 
 void RegAllocFastImpl::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
@@ -1841,7 +1841,7 @@ void RegAllocFastPass::printPipeline(
 
 FunctionPass *llvm::createFastRegisterAllocator() { return new RegAllocFast(); }
 
-FunctionPass *llvm::createFastRegisterAllocator(RegAllocFilterFunc Ftor,
+FunctionPass *llvm::createFastRegisterAllocator(RegClassFilterFunc Ftor,
                                                 bool ClearVirtRegs) {
   return new RegAllocFast(Ftor, ClearVirtRegs);
 }
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 5001b4fec58f2..310050d92d744 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -192,12 +192,14 @@ FunctionPass* llvm::createGreedyRegisterAllocator() {
   return new RAGreedy();
 }
 
-FunctionPass *llvm::createGreedyRegisterAllocator(RegAllocFilterFunc Ftor) {
+FunctionPass *llvm::createGreedyRegisterAllocator(RegClassFilterFunc Ftor) {
   return new RAGreedy(Ftor);
 }
 
-RAGreedy::RAGreedy(RegAllocFilterFunc F)
-    : MachineFunctionPass(ID), RegAllocBase(F) {}
+RAGreedy::RAGreedy(RegClassFilterFunc F):
+  MachineFunctionPass(ID),
+  RegAllocBase(F) {
+}
 
 void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
@@ -2304,7 +2306,7 @@ void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
     if (Reg.isPhysical())
       continue;
 
-    // This may be a skipped register.
+    // This may be a skipped class.
     if (!VRM->hasPhys(Reg)) {
       assert(!shouldAllocateRegister(Reg) &&
              "We have an unallocated variable which should have been handled");
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index 2e7608a53e9ce..ac300c0024f5a 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -281,7 +281,7 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
   bool ReverseLocalAssignment = false;
 
 public:
-  RAGreedy(const RegAllocFilterFunc F = nullptr);
+  RAGreedy(const RegClassFilterFunc F = nullptr);
 
   /// Return the pass name.
   StringRef getPassName() const override { return "Greedy Register Allocator"; }
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aa9032ea2574c..9b2153c68ccae 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11501,28 +11501,28 @@ static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL,
   if (VT != Cond.getValueType() || VT.getScalarSizeInBits() != 1)
     return SDValue();
 
-  // select Cond, Cond, F --> or Cond, freeze(F)
-  // select Cond, 1, F    --> or Cond, freeze(F)
+  // select Cond, Cond, F --> or Cond, F
+  // select Cond, 1, F    --> or Cond, F
   if (Cond == T || isOneOrOneSplat(T, /* AllowUndefs */ true))
-    return matcher.getNode(ISD::OR, DL, VT, Cond, DAG.getFreeze(F));
+    return matcher.getNode(ISD::OR, DL, VT, Cond, F);
 
-  // select Cond, T, Cond --> and Cond, freeze(T)
-  // select Cond, T, 0    --> and Cond, freeze(T)
+  // select Cond, T, Cond --> and Cond, T
+  // select Cond, T, 0    --> and Cond, T
   if (Cond == F || isNullOrNullSplat(F, /* AllowUndefs */ true))
-    return matcher.getNode(ISD::AND, DL, VT, Cond, DAG.getFreeze(T));
+    return matcher.getNode(ISD::AND, DL, VT, Cond, T);
 
-  // select Cond, T, 1 --> or (not Cond), freeze(T)
+  // select Cond, T, 1 --> or (not Cond), T
   if (isOneOrOneSplat(F, /* AllowUndefs */ true)) {
     SDValue NotCond =
         matcher.getNode(ISD::XOR, DL, VT, Cond, DAG.getAllOnesConstant(DL, VT));
-    return matcher.getNode(ISD::OR, DL, VT, NotCond, DAG.getFreeze(T));
+    return matcher.getNode(ISD::OR, DL, VT, NotCond, T);
   }
 
-  // select Cond, 0, F --> and (not Cond), freeze(F)
+  // select Cond, 0, F --> and (not Cond), F
   if (isNullOrNullSplat(T, /* AllowUndefs */ true)) {
     SDValue NotCond =
         matcher.getNode(ISD::XOR, DL, VT, Cond, DAG.getAllOnesConstant(DL, VT));
-    return matcher.getNode(ISD::AND, DL, VT, NotCond, DAG.getFreeze(F));
+    return matcher.getNode(ISD::AND, DL, VT, NotCond, F);
   }
 
   return SDValue();
@@ -11550,37 +11550,37 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
   else
     return SDValue();
 
-  // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & freeze(N1)
+  // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & N1
   if (isNullOrNullSplat(N2)) {
     SDLoc DL(N);
     SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
-    return DAG.getNode(ISD::AND, DL, VT, Sra, DAG.getFreeze(N1));
+    return DAG.getNode(ISD::AND, DL, VT, Sra, N1);
   }
 
-  // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | freeze(N2)
+  // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | N2
   if (isAllOnesOrAllOnesSplat(N1)) {
     SDLoc DL(N);
     SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
-    return DAG.getNode(ISD::OR, DL, VT, Sra, DAG.getFreeze(N2));
+    return DAG.getNode(ISD::OR, DL, VT, Sra, N2);
   }
 
   // If we have to invert the sign bit mask, only do that transform if the
   // target has a bitwise 'and not' instruction (the invert is free).
-  // (Cond0 s< -0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & freeze(N2)
+  // (Cond0 s< -0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & N2
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (isNullOrNullSplat(N1) && TLI.hasAndNot(N1)) {
     SDLoc DL(N);
     SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
     SDValue Not = DAG.getNOT(DL, Sra, VT);
-    return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2));
+    return DAG.getNode(ISD::AND, DL, VT, Not, N2);
   }
 
   // TODO: There's another pattern in this family, but it may require
   //       implementing hasOrNot() to check for profitability:
-  //       (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+  //       (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | N2
 
   return SDValue();
 }
@@ -13782,10 +13782,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
 /// Given an extending node with a pop-count operand, if the target does not
 /// support a pop-count in the narrow source type but does support it in the
 /// destination type, widen the pop-count to the destination type.
-static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG, const SDLoc &DL) {
+static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
   assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
-          Extend->getOpcode() == ISD::ANY_EXTEND) &&
-         "Expected extend op");
+          Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");
 
   SDValue CtPop = Extend->getOperand(0);
   if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
@@ -13798,6 +13797,7 @@ static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG, const SDLoc &DL) {
     return SDValue();
 
   // zext (ctpop X) --> ctpop (zext X)
+  SDLoc DL(Extend);
   SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
   return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
 }
@@ -14138,7 +14138,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
-  if (SDValue NewCtPop = widenCtPop(N, DAG, DL))
+  if (SDValue NewCtPop = widenCtPop(N, DAG))
     return NewCtPop;
 
   if (SDValue V = widenAbs(N, DAG))
@@ -14316,7 +14316,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       return SCC;
   }
 
-  if (SDValue NewCtPop = widenCtPop(N, DAG, DL))
+  if (SDValue NewCtPop = widenCtPop(N, DAG))
     return NewCtPop;
 
   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
@@ -15653,8 +15653,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
     return SDValue();
 
   bool AllowMultipleMaybePoisonOperands =
-      N0.getOpcode() == ISD::SELECT_CC ||
-      N0.getOpcode() == ISD::SETCC ||
       N0.getOpcode() == ISD::BUILD_VECTOR ||
       N0.getOpcode() == ISD::BUILD_PAIR ||
       N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
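
The select-to-logic folds above rest on plain i1 identities (the freeze()
wrappers this hunk removes guard against poison propagation and do not change
the boolean algebra). A stand-alone check:

#include <cassert>

int main() {
  for (bool C : {false, true})
    for (bool T : {false, true})
      for (bool F : {false, true}) {
        assert((C ? C : F) == (C || F));      // select C, C, F --> or C, F
        assert((C ? T : C) == (C && T));      // select C, T, C --> and C, T
        assert((C ? T : true) == (!C || T));  // select C, T, 1 --> or (not C), T
        assert((C ? false : F) == (!C && F)); // select C, 0, F --> and (not C), F
      }
}
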
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d420edc8f2060..02d44cd36ae53 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4705,18 +4705,14 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
       return 1;  // Early out.
     Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
     return std::min(Tmp, Tmp2);
-  case ISD::SSUBO_CARRY:
-  case ISD::USUBO_CARRY:
-    // sub_carry(x,x,c) -> 0/-1 (sext carry)
-    if (Op.getResNo() == 0 && Op.getOperand(0) == Op.getOperand(1))
-      return VTBits;
-    [[fallthrough]];
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SADDO_CARRY:
   case ISD::UADDO_CARRY:
   case ISD::SSUBO:
   case ISD::USUBO:
+  case ISD::SSUBO_CARRY:
+  case ISD::USUBO_CARRY:
   case ISD::SMULO:
   case ISD::UMULO:
     if (Op.getResNo() != 1)
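
The removed special case observed that sub_carry(x, x, c) computes 0 - c,
which is 0 or -1, so every result bit is a sign bit (hence VTBits). The
arithmetic, checked stand-alone:

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t X : {int64_t(0), int64_t(42), INT64_MIN})
    for (int64_t C : {int64_t(0), int64_t(1)}) {
      int64_t R = X - X - C;     // the sub_carry(x, x, c) shape
      assert(R == 0 || R == -1); // a sign-extended carry bit
    }
}
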
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 37b1131d2f8a3..98a795edb7a03 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -9566,15 +9566,10 @@ static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
   // Otherwise, create a stack slot and emit a store to it before the asm.
   Type *Ty = OpVal->getType();
   auto &DL = DAG.getDataLayout();
-  TypeSize TySize = DL.getTypeAllocSize(Ty);
+  uint64_t TySize = DL.getTypeAllocSize(Ty);
   MachineFunction &MF = DAG.getMachineFunction();
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
-  int StackID = 0;
-  if (TySize.isScalable())
-    StackID = TFI->getStackIDForScalableVectors();
-  int SSFI = MF.getFrameInfo().CreateStackObject(TySize.getKnownMinValue(),
-                                                 DL.getPrefTypeAlign(Ty), false,
-                                                 nullptr, StackID);
+  int SSFI = MF.getFrameInfo().CreateStackObject(
+      TySize, DL.getPrefTypeAlign(Ty), false);
   SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL));
   Chain = DAG.getTruncStore(Chain, Location, OpInfo.CallOperand, StackSlot,
                             MachinePointerInfo::getFixedStack(MF, SSFI),
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 940aecd1cb363..5d3903ed84ce8 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -62,14 +62,11 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     int Align;
     int Offset;
     SlotType SlotTy;
-    bool Scalable;
 
     SlotData(const MachineFrameInfo &MFI, const int ValOffset, const int Idx)
         : Slot(Idx), Size(MFI.getObjectSize(Idx)),
           Align(MFI.getObjectAlign(Idx).value()),
-          Offset(MFI.getObjectOffset(Idx) - ValOffset), SlotTy(Invalid),
-          Scalable(false) {
-      Scalable = MFI.getStackID(Idx) == TargetStackID::ScalableVector;
+          Offset(MFI.getObjectOffset(Idx) - ValOffset), SlotTy(Invalid) {
       if (MFI.isSpillSlotObjectIndex(Idx))
         SlotTy = SlotType::Spill;
       else if (Idx == MFI.getStackProtectorIndex())
@@ -78,12 +75,9 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
         SlotTy = SlotType::Variable;
     }
 
-    // We use this to sort in reverse order, so that the layout is displayed
-    // correctly. Scalable slots are sorted to the end of the list.
-    bool operator<(const SlotData &Rhs) const {
-      return std::make_tuple(!Scalable, Offset) >
-             std::make_tuple(!Rhs.Scalable, Rhs.Offset);
-    }
+    // We use this to sort in reverse order, so that the layout is displayed
+    // correctly.
+    bool operator<(const SlotData &Rhs) const { return Offset > Rhs.Offset; }
   };
 
   StackFrameLayoutAnalysisPass() : MachineFunctionPass(ID) {}
@@ -159,7 +153,7 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     Rem << Prefix << ore::NV("Offset", D.Offset)
         << "], Type: " << ore::NV("Type", getTypeString(D.SlotTy))
         << ", Align: " << ore::NV("Align", D.Align)
-        << ", Size: " << ore::NV("Size", ElementCount::get(D.Size, D.Scalable));
+        << ", Size: " << ore::NV("Size", D.Size);
   }
 
   void emitSourceLocRemark(const MachineFunction &MF, const DILocalVariable *N,
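
The removed operator< keys the sort on (!Scalable, Offset) compared with
greater-than: fixed slots come first in descending offset order, and scalable
slots sink to the end. The same idiom on a toy type:

#include <algorithm>
#include <tuple>
#include <vector>

struct Slot {
  bool Scalable;
  int Offset;
};

int main() {
  std::vector<Slot> Slots = {{false, 8}, {true, 0}, {false, 16}};
  std::sort(Slots.begin(), Slots.end(), [](const Slot &L, const Slot &R) {
    // Tuple comparison: fixed (non-scalable) first, then higher offsets.
    return std::make_tuple(!L.Scalable, L.Offset) >
           std::make_tuple(!R.Scalable, R.Offset);
  });
  // Resulting order: {false,16}, {false,8}, {true,0}
}
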
diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CodeGenData/CMakeLists.txt
index f9d107f52a715..3ba90f96cc86d 100644
--- a/llvm/lib/CodeGenData/CMakeLists.txt
+++ b/llvm/lib/CodeGenData/CMakeLists.txt
@@ -11,5 +11,4 @@ add_llvm_component_library(LLVMCodeGenData
   LINK_COMPONENTS
   Core
   Support
-  Object
   )
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ab2482b91213b..a06e81dd280e0 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -360,23 +360,23 @@ BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
 // This function creates a fake integer value and a fake use for the integer
 // value. It returns the fake value created. This is useful in modeling the
 // extra arguments to the outlined functions.
-Value *createFakeIntVal(IRBuilderBase &Builder,
+Value *createFakeIntVal(IRBuilder<> &Builder,
                         OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
-                        llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
+                        std::stack<Instruction *> &ToBeDeleted,
                         OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                         const Twine &Name = "", bool AsPtr = true) {
   Builder.restoreIP(OuterAllocaIP);
   Instruction *FakeVal;
   AllocaInst *FakeValAddr =
       Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
-  ToBeDeleted.push_back(FakeValAddr);
+  ToBeDeleted.push(FakeValAddr);
 
   if (AsPtr) {
     FakeVal = FakeValAddr;
   } else {
     FakeVal =
         Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
-    ToBeDeleted.push_back(FakeVal);
+    ToBeDeleted.push(FakeVal);
   }
 
   // Generate a fake use of this value
@@ -389,7 +389,7 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
     UseFakeVal =
         cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
   }
-  ToBeDeleted.push_back(UseFakeVal);
+  ToBeDeleted.push(UseFakeVal);
   return FakeVal;
 }
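
The std::stack makes the deletion order explicit: instructions pop
last-in-first-out, so the fake use is erased before the fake value, and the
value before its alloca, mirroring the reversed-SmallVector walk this hunk
replaces further down. The pattern in miniature:

#include <stack>
#include <vector>

int main() {
  std::stack<int> ToBeDeleted;
  for (int I : {1, 2, 3}) // push in creation order: addr, load, fake use
    ToBeDeleted.push(I);
  std::vector<int> ErasureOrder;
  while (!ToBeDeleted.empty()) {
    ErasureOrder.push_back(ToBeDeleted.top()); // users go before defs
    ToBeDeleted.pop();
  }
  // ErasureOrder == {3, 2, 1}
}
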
 
@@ -1707,74 +1707,6 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
   emitTaskyieldImpl(Loc);
 }
 
-// Processes the dependencies in Dependencies and does the following
-// - Allocates space on the stack of an array of DependInfo objects
-// - Populates each DependInfo object with relevant information of
-//   the corresponding dependence.
-// - All code is inserted in the entry block of the current function.
-static Value *emitTaskDependencies(
-    OpenMPIRBuilder &OMPBuilder,
-    SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
-  // Early return if we have no dependencies to process
-  if (Dependencies.empty())
-    return nullptr;
-
-  // Given a vector of DependData objects, in this function we create an
-  // array on the stack that holds kmp_dep_info objects corresponding
-  // to each dependency. This is then passed to the OpenMP runtime.
-  // For example, if there are 'n' dependencies then the following psedo
-  // code is generated. Assume the first dependence is on a variable 'a'
-  //
-  // \code{c}
-  // DepArray = alloc(n x sizeof(kmp_depend_info);
-  // idx = 0;
-  // DepArray[idx].base_addr = ptrtoint(&a);
-  // DepArray[idx].len = 8;
-  // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
-  // ++idx;
-  // DepArray[idx].base_addr = ...;
-  // \endcode
-
-  IRBuilderBase &Builder = OMPBuilder.Builder;
-  Type *DependInfo = OMPBuilder.DependInfo;
-  Module &M = OMPBuilder.M;
-
-  Value *DepArray = nullptr;
-  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
-  Builder.SetInsertPoint(
-      OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
-
-  Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
-  DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
-
-  for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
-    Value *Base =
-        Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
-    // Store the pointer to the variable
-    Value *Addr = Builder.CreateStructGEP(
-        DependInfo, Base,
-        static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
-    Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
-    Builder.CreateStore(DepValPtr, Addr);
-    // Store the size of the variable
-    Value *Size = Builder.CreateStructGEP(
-        DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
-    Builder.CreateStore(
-        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
-        Size);
-    // Store the dependency kind
-    Value *Flags = Builder.CreateStructGEP(
-        DependInfo, Base,
-        static_cast<unsigned int>(RTLDependInfoFields::Flags));
-    Builder.CreateStore(
-        ConstantInt::get(Builder.getInt8Ty(),
-                         static_cast<unsigned int>(Dep.DepKind)),
-        Flags);
-  }
-  Builder.restoreIP(OldIP);
-  return DepArray;
-}
-
 OpenMPIRBuilder::InsertPointTy
 OpenMPIRBuilder::createTask(const LocationDescription &Loc,
                             InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
@@ -1819,7 +1751,7 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
   OI.ExitBB = TaskExitBB;
 
   // Add the thread ID argument.
-  SmallVector<Instruction *, 4> ToBeDeleted;
+  std::stack<Instruction *> ToBeDeleted;
   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
       Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
 
@@ -2016,8 +1948,10 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
           Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
     }
 
-    llvm::for_each(llvm::reverse(ToBeDeleted),
-                   [](Instruction *I) { I->eraseFromParent(); });
+    while (!ToBeDeleted.empty()) {
+      ToBeDeleted.top()->eraseFromParent();
+      ToBeDeleted.pop();
+    }
   };
 
   addOutlineInfo(std::move(OI));
@@ -6683,91 +6617,6 @@ static Function *createOutlinedFunction(
   return Func;
 }
 
-/// Create an entry point for a target task with the following.
-/// It'll have the following signature
-/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
-/// This function is called from emitTargetTask once the
-/// code to launch the target kernel has been outlined already.
-static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
-                                             IRBuilderBase &Builder,
-                                             CallInst *StaleCI) {
-  Module &M = OMPBuilder.M;
-  // KernelLaunchFunction is the target launch function, i.e.
-  // the function that sets up kernel arguments and calls
-  // __tgt_target_kernel to launch the kernel on the device.
-  //
-  Function *KernelLaunchFunction = StaleCI->getCalledFunction();
-
-  // StaleCI is the CallInst which is the call to the outlined
-  // target kernel launch function. If there are values that the
-  // outlined function uses then these are aggregated into a structure
-  // which is passed as the second argument. If not, then there's
-  // only one argument, the threadID. So, StaleCI can be
-  //
-  // %structArg = alloca { ptr, ptr }, align 8
-  // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
-  // store ptr %20, ptr %gep_, align 8
-  // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
-  // store ptr %21, ptr %gep_8, align 8
-  // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
-  //
-  // OR
-  //
-  // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
-  OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
-                                    StaleCI->getIterator());
-  LLVMContext &Ctx = StaleCI->getParent()->getContext();
-  Type *ThreadIDTy = Type::getInt32Ty(Ctx);
-  Type *TaskPtrTy = OMPBuilder.TaskPtr;
-  Type *TaskTy = OMPBuilder.Task;
-  auto ProxyFnTy =
-      FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
-                        /* isVarArg */ false);
-  auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
-                                  ".omp_target_task_proxy_func",
-                                  Builder.GetInsertBlock()->getModule());
-  ProxyFn->getArg(0)->setName("thread.id");
-  ProxyFn->getArg(1)->setName("task");
-
-  BasicBlock *EntryBB =
-      BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
-  Builder.SetInsertPoint(EntryBB);
-
-  bool HasShareds = StaleCI->arg_size() > 1;
-  // TODO: This is a temporary assert to prove to ourselves that
-  // the outlined target launch function is always going to have
-  // atmost two arguments if there is any data shared between
-  // host and device.
-  assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
-         "StaleCI with shareds should have exactly two arguments.");
-  if (HasShareds) {
-    auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
-    assert(ArgStructAlloca &&
-           "Unable to find the alloca instruction corresponding to arguments "
-           "for extracted function");
-    auto *ArgStructType =
-        dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
-
-    AllocaInst *NewArgStructAlloca =
-        Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
-    Value *TaskT = ProxyFn->getArg(1);
-    Value *ThreadId = ProxyFn->getArg(0);
-    Value *SharedsSize =
-        Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
-
-    Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
-    LoadInst *LoadShared =
-        Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
-
-    Builder.CreateMemCpy(
-        NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
-        LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
-
-    Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
-  }
-  Builder.CreateRetVoid();
-  return ProxyFn;
-}
 static void emitTargetOutlinedFunction(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
     TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
@@ -6785,281 +6634,13 @@ static void emitTargetOutlinedFunction(
   OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
                                       OutlinedFn, OutlinedFnID);
 }
-OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
-    Function *OutlinedFn, Value *OutlinedFnID,
-    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
-    Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
-    SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
-    bool HasNoWait) {
-
-  // When we arrive at this function, the target region itself has been
-  // outlined into the function OutlinedFn.
-  // So at this point, for
-  // --------------------------------------------------
-  //   void user_code_that_offloads(...) {
-  //     omp target depend(..) map(from:a) map(to:b, c)
-  //        a = b + c
-  //   }
-  //
-  // --------------------------------------------------
-  //
-  // we have
-  //
-  // --------------------------------------------------
-  //
-  //   void user_code_that_offloads(...) {
-  //     %.offload_baseptrs = alloca [3 x ptr], align 8
-  //     %.offload_ptrs = alloca [3 x ptr], align 8
-  //     %.offload_mappers = alloca [3 x ptr], align 8
-  //     ;; target region has been outlined and now we need to
-  //     ;; offload to it via a target task.
-  //   }
-  //   void outlined_device_function(ptr a, ptr b, ptr c) {
-  //     *a = *b + *c
-  //   }
-  //
-  // We have to now do the following
-  // (i)   Make an offloading call to outlined_device_function using the OpenMP
-  //       RTL. See 'kernel_launch_function' in the pseudo code below. This is
-  //       emitted by emitKernelLaunch.
-  // (ii)  Create a task entry point function that calls kernel_launch_function
-  //       and is the entry point for the target task. See
-  //       '@.omp_target_task_proxy_func' in the pseudocode below.
-  // (iii) Create a task with the task entry point created in (ii)
-  //
-  // That is we create the following
-  //
-  //   void user_code_that_offloads(...) {
-  //     %.offload_baseptrs = alloca [3 x ptr], align 8
-  //     %.offload_ptrs = alloca [3 x ptr], align 8
-  //     %.offload_mappers = alloca [3 x ptr], align 8
-  //
-  //     %structArg = alloca { ptr, ptr, ptr }, align 8
-  //     %structArg[0] = %.offload_baseptrs
-  //     %structArg[1] = %.offload_ptrs
-  //     %structArg[2] = %.offload_mappers
-  //     proxy_target_task = @__kmpc_omp_task_alloc(...,
-  //                                               @.omp_target_task_proxy_func)
-  //     memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
-  //     dependencies_array = ...
-  //     ;; if nowait not present
-  //     call @__kmpc_omp_wait_deps(..., dependencies_array)
-  //     call @__kmpc_omp_task_begin_if0(...)
-  //     call @.omp_target_task_proxy_func(i32 thread_id,
-  //                                       ptr %proxy_target_task)
-  //     call @__kmpc_omp_task_complete_if0(...)
-  //   }
-  //
-  //   define internal void @.omp_target_task_proxy_func(i32 %thread.id,
-  //                                                     ptr %task) {
-  //       %structArg = alloca {ptr, ptr, ptr}
-  //       %shared_data = load (getelementptr %task, 0, 0)
-  //       memcpy(%structArg, %shared_data, sizeof(structArg))
-  //       kernel_launch_function(%thread.id, %structArg)
-  //   }
-  //
-  //   We need the proxy function because the signature of the task entry point
-  //   expected by kmpc_omp_task is always the same and will be different from
-  //   that of the kernel_launch function.
-  //
-  //   kernel_launch_function is generated by emitKernelLaunch and has the
-  //   always_inline attribute.
-  //   void kernel_launch_function(thread_id,
-  //                               structArg) alwaysinline {
-  //       %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
-  //       offload_baseptrs = load(getelementptr structArg, 0, 0)
-  //       offload_ptrs = load(getelementptr structArg, 0, 1)
-  //       offload_mappers = load(getelementptr structArg, 0, 2)
-  //       ; setup kernel_args using offload_baseptrs, offload_ptrs and
-  //       ; offload_mappers
-  //       call i32 @__tgt_target_kernel(...,
-  //                                     outlined_device_function,
-  //                                     ptr %kernel_args)
-  //   }
-  //   void outlined_device_function(ptr a, ptr b, ptr c) {
-  //      *a = *b + *c
-  //   }
-  //
-  BasicBlock *TargetTaskBodyBB =
-      splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
-  BasicBlock *TargetTaskAllocaBB =
-      splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
-
-  InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
-                                   TargetTaskAllocaBB->begin());
-  InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
-
-  OutlineInfo OI;
-  OI.EntryBB = TargetTaskAllocaBB;
-  OI.OuterAllocaBB = AllocaIP.getBlock();
-
-  // Add the thread ID argument.
-  SmallVector<Instruction *, 4> ToBeDeleted;
-  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
-      Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
-
-  Builder.restoreIP(TargetTaskBodyIP);
-
-  // emitKernelLaunch makes the necessary runtime call to offload the kernel.
-  // We then outline all that code into a separate function
-  // ('kernel_launch_function' in the pseudo code above). This function is then
-  // called by the target task proxy function (see
-  // '@.omp_target_task_proxy_func' in the pseudo code above).
-  // '@.omp_target_task_proxy_func' is generated by emitTargetTaskProxyFunction.
-  Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
-                                     EmitTargetCallFallbackCB, Args, DeviceID,
-                                     RTLoc, TargetTaskAllocaIP));
-
-  OI.ExitBB = Builder.saveIP().getBlock();
-  OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
-                      HasNoWait](Function &OutlinedFn) mutable {
-    assert(OutlinedFn.getNumUses() == 1 &&
-           "there must be a single user for the outlined function");
-
-    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-    bool HasShareds = StaleCI->arg_size() > 1;
-
-    Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
-
-    LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
-                      << "\n");
-
-    Builder.SetInsertPoint(StaleCI);
-
-    // Gather the arguments for emitting the runtime call.
-    uint32_t SrcLocStrSize;
-    Constant *SrcLocStr =
-        getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
-    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
-
-    // @__kmpc_omp_task_alloc
-    Function *TaskAllocFn =
-        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
-
-    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the
-    // runtime call.
-    Value *ThreadID = getOrCreateThreadID(Ident);
-
-    // Argument - `sizeof_kmp_task_t` (TaskSize)
-    // TaskSize refers to the size in bytes of the kmp_task_t data structure
-    // including private variables accessed in the task.
-    // TODO: add kmp_task_t_with_privates (privates)
-    Value *TaskSize =
-        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
-
-    // Argument - `sizeof_shareds` (SharedsSize)
-    // SharedsSize refers to the shareds array size in the kmp_task_t data
-    // structure.
-    Value *SharedsSize = Builder.getInt64(0);
-    if (HasShareds) {
-      auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
-      assert(ArgStructAlloca &&
-             "Unable to find the alloca instruction corresponding to arguments "
-             "for extracted function");
-      auto *ArgStructType =
-          dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
-      assert(ArgStructType && "Unable to find struct type corresponding to "
-                              "arguments for extracted function");
-      SharedsSize =
-          Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
-    }
-
-    // Argument - `flags`
-    // Task is tied iff (Flags & 1) == 1.
-    // Task is untied iff (Flags & 1) == 0.
-    // Task is final iff (Flags & 2) == 2.
-    // Task is not final iff (Flags & 2) == 0.
-    // A target task is not final and is untied.
-    Value *Flags = Builder.getInt32(0);
-
-    // Emit the @__kmpc_omp_task_alloc runtime call.
-    // The runtime call returns a pointer to an area where the task's captured
-    // variables must be copied before the task is run (TaskData).
-    CallInst *TaskData = Builder.CreateCall(
-        TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
-                      /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
-                      /*task_func=*/ProxyFn});
-
-    if (HasShareds) {
-      Value *Shareds = StaleCI->getArgOperand(1);
-      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
-      Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
-      Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
-                           SharedsSize);
-    }
-
-    Value *DepArray = emitTaskDependencies(*this, Dependencies);
-
-    // ---------------------------------------------------------------
-    // V5.2 13.8 target construct
-    // If the nowait clause is present, execution of the target task
-    // may be deferred. If the nowait clause is not present, the target task is
-    // an included task.
-    // ---------------------------------------------------------------
-    // The above means that the lack of a nowait on the target construct
-    // translates to '#pragma omp task if(0)'
-    if (!HasNoWait) {
-      if (DepArray) {
-        Function *TaskWaitFn =
-            getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
-        Builder.CreateCall(
-            TaskWaitFn,
-            {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
-             /*ndeps=*/Builder.getInt32(Dependencies.size()),
-             /*dep_list=*/DepArray,
-             /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
-             /*noalias_dep_list=*/
-             ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
-      }
-      // Included task.
-      Function *TaskBeginFn =
-          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
-      Function *TaskCompleteFn =
-          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
-      Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
-      CallInst *CI = nullptr;
-      if (HasShareds)
-        CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
-      else
-        CI = Builder.CreateCall(ProxyFn, {ThreadID});
-      CI->setDebugLoc(StaleCI->getDebugLoc());
-      Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
-    } else if (DepArray) {
-      // HasNoWait is set, meaning the task may be deferred. Call
-      // __kmpc_omp_task_with_deps if there are dependencies;
-      // otherwise call __kmpc_omp_task.
-      Function *TaskFn =
-          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
-      Builder.CreateCall(
-          TaskFn,
-          {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
-           DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
-           ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
-    } else {
-      // Emit the @__kmpc_omp_task runtime call to spawn the task
-      Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
-      Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
-    }
 
-    StaleCI->eraseFromParent();
-    llvm::for_each(llvm::reverse(ToBeDeleted),
-                   [](Instruction *I) { I->eraseFromParent(); });
-  };
-  addOutlineInfo(std::move(OI));
-
-  LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
-                    << *(Builder.GetInsertBlock()) << "\n");
-  LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
-                    << *(Builder.GetInsertBlock()->getParent()->getParent())
-                    << "\n");
-  return Builder.saveIP();
-}
-static void emitTargetCall(
-    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
-    OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
-    Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads,
-    SmallVectorImpl<Value *> &Args,
-    OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
-    SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {}) {
+static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
+                           OpenMPIRBuilder::InsertPointTy AllocaIP,
+                           Function *OutlinedFn, Constant *OutlinedFnID,
+                           int32_t NumTeams, int32_t NumThreads,
+                           SmallVectorImpl<Value *> &Args,
+                           OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {
 
   OpenMPIRBuilder::TargetDataInfo Info(
       /*RequiresDevicePointerInfo=*/false,
@@ -7096,34 +6677,23 @@ static void emitTargetCall(
   Value *DynCGGroupMem = Builder.getInt32(0);
 
   bool HasNoWait = false;
-  bool HasDependencies = Dependencies.size() > 0;
-  bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
 
   OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
                                           NumTeamsVal, NumThreadsVal,
                                           DynCGGroupMem, HasNoWait);
 
-  // The presence of certain clauses on the target directive requires the
-  // explicit generation of the target task.
-  if (RequiresOuterTargetTask) {
-    Builder.restoreIP(OMPBuilder.emitTargetTask(
-        OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID,
-        RTLoc, AllocaIP, Dependencies, HasNoWait));
-  } else {
-    Builder.restoreIP(OMPBuilder.emitKernelLaunch(
-        Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
-        DeviceID, RTLoc, AllocaIP));
-  }
+  Builder.restoreIP(OMPBuilder.emitKernelLaunch(
+      Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
+      DeviceID, RTLoc, AllocaIP));
 }
+
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
     int32_t NumThreads, SmallVectorImpl<Value *> &Args,
     GenMapInfoCallbackTy GenMapInfoCB,
     OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
-    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
-    SmallVector<DependData> Dependencies) {
-
+    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
@@ -7131,18 +6701,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
 
   Function *OutlinedFn;
   Constant *OutlinedFnID;
-  // The target region is outlined into its own function. The LLVM IR for
-  // the target region itself is generated using the callbacks CBFunc
-  // and ArgAccessorFuncCB
   emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
                              OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
-
-  // If we are not on the target device, then we need to generate code
-  // to make a remote call (offload) to the previously outlined function
-  // that represents the target region. Do that now.
   if (!Config.isTargetDevice())
     emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
-                   NumThreads, Args, GenMapInfoCB, Dependencies);
+                   NumThreads, Args, GenMapInfoCB);
+
   return Builder.saveIP();
 }
 
@@ -8263,7 +7827,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
   OI.OuterAllocaBB = &OuterAllocaBB;
 
   // Insert fake values for global tid and bound tid.
-  SmallVector<Instruction *, 8> ToBeDeleted;
+  std::stack<Instruction *> ToBeDeleted;
   InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
       Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
@@ -8278,7 +7842,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
     assert(OutlinedFn.getNumUses() == 1 &&
            "there must be a single user for the outlined function");
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-    ToBeDeleted.push_back(StaleCI);
+    ToBeDeleted.push(StaleCI);
 
     assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
            "Outlined function must have two or three arguments only");
@@ -8302,9 +7866,10 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
                            omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
                        Args);
 
-    llvm::for_each(llvm::reverse(ToBeDeleted),
-                   [](Instruction *I) { I->eraseFromParent(); });
-
+    while (!ToBeDeleted.empty()) {
+      ToBeDeleted.top()->eraseFromParent();
+      ToBeDeleted.pop();
+    }
   };
 
   if (!Config.isTargetDevice())
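
For readers skimming the removed code above: here is a minimal C++ sketch of
the proxy-function pattern it implemented. KmpTask, StructArg, and
kernelLaunchFunction are simplified illustrative stand-ins, not LLVM or libomp
APIs. The point is that the task runtime invokes every task through one fixed
entry-point signature, so the generated trampoline copies the task's shareds
area into a local aggregate and forwards it to the real kernel-launch function.

#include <cstdint>
#include <cstring>

struct KmpTask {   // stand-in for kmp_task_t; the shareds pointer comes first
  void *Shareds;
};

struct StructArg { // aggregated captures, as in the %structArg pseudocode
  void *OffloadBaseptrs;
  void *OffloadPtrs;
  void *OffloadMappers;
};

// Stand-in for the outlined launch function emitted by emitKernelLaunch.
static void kernelLaunchFunction(int32_t /*ThreadId*/, StructArg * /*Args*/) {}

// Fixed-signature task entry point, analogous to @.omp_target_task_proxy_func.
void ompTargetTaskProxyFunc(int32_t ThreadId, KmpTask *Task) {
  StructArg Local;
  std::memcpy(&Local, Task->Shareds, sizeof(Local)); // copy shareds locally
  kernelLaunchFunction(ThreadId, &Local);            // forward to the launcher
}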
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index ae5f5de142328..3aec7140510a6 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1287,12 +1287,12 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
     return A;
 
   // First, walk both lists in order of the lower boundary of each interval.
   // At each step, try to merge the new interval to the last one we added.
   SmallVector<ConstantInt *, 4> EndPoints;
-  unsigned AI = 0;
-  unsigned BI = 0;
-  unsigned AN = A->getNumOperands() / 2;
-  unsigned BN = B->getNumOperands() / 2;
+  int AI = 0;
+  int BI = 0;
+  int AN = A->getNumOperands() / 2;
+  int BN = B->getNumOperands() / 2;
   while (AI < AN && BI < BN) {
     ConstantInt *ALow = mdconst::extract<ConstantInt>(A->getOperand(2 * AI));
     ConstantInt *BLow = mdconst::extract<ConstantInt>(B->getOperand(2 * BI));
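
As context for the loop-index change above: the merge in getMostGenericRange
is a two-pointer walk over two lists of [Lo, Hi) pairs sorted by lower bound,
folding each consumed interval into the last one emitted. A hedged sketch of
that walk, with plain integers standing in for the ConstantInt operands
(mergeSorted is an illustrative name, not the LLVM function):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Range = std::pair<int64_t, int64_t>; // [Lo, Hi)

std::vector<Range> mergeSorted(const std::vector<Range> &A,
                               const std::vector<Range> &B) {
  std::vector<Range> Out;
  size_t AI = 0, BI = 0;
  auto push = [&Out](Range R) {
    // Try to merge with the last interval we added, as the comment describes.
    if (!Out.empty() && R.first <= Out.back().second)
      Out.back().second = std::max(Out.back().second, R.second);
    else
      Out.push_back(R);
  };
  while (AI < A.size() && BI < B.size())
    push(A[AI].first < B[BI].first ? A[AI++] : B[BI++]);
  while (AI < A.size())
    push(A[AI++]);
  while (BI < B.size())
    push(B[BI++]);
  return Out;
}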
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index f958905a26aa4..59d796b419b35 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -67,6 +67,7 @@ using namespace llvm;
 
 namespace {
 
+class ELFObjectWriter;
 struct ELFWriter;
 
 bool isDwoSection(const MCSectionELF &Sec) {
@@ -198,6 +199,116 @@ struct ELFWriter {
   void writeSection(uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
                     const MCSectionELF &Section);
 };
+
+class ELFObjectWriter : public MCObjectWriter {
+  /// The target specific ELF writer instance.
+  std::unique_ptr<MCELFObjectTargetWriter> TargetObjectWriter;
+
+  DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
+
+  DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
+
+  bool SeenGnuAbi = false;
+
+  std::optional<uint8_t> OverrideABIVersion;
+
+  bool hasRelocationAddend() const;
+
+  bool shouldRelocateWithSymbol(const MCAssembler &Asm, const MCValue &Val,
+                                const MCSymbolELF *Sym, uint64_t C,
+                                unsigned Type) const;
+
+public:
+  ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW)
+      : TargetObjectWriter(std::move(MOTW)) {}
+
+  void reset() override {
+    SeenGnuAbi = false;
+    OverrideABIVersion.reset();
+    Relocations.clear();
+    Renames.clear();
+    MCObjectWriter::reset();
+  }
+
+  bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                              const MCSymbol &SymA,
+                                              const MCFragment &FB, bool InSet,
+                                              bool IsPCRel) const override;
+
+  virtual bool checkRelocation(MCContext &Ctx, SMLoc Loc,
+                               const MCSectionELF *From,
+                               const MCSectionELF *To) {
+    return true;
+  }
+
+  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
+  bool usesRela(const MCTargetOptions *TO, const MCSectionELF &Sec) const;
+
+  void executePostLayoutBinding(MCAssembler &Asm) override;
+
+  void markGnuAbi() override { SeenGnuAbi = true; }
+  bool seenGnuAbi() const { return SeenGnuAbi; }
+
+  bool seenOverrideABIVersion() const { return OverrideABIVersion.has_value(); }
+  uint8_t getOverrideABIVersion() const { return OverrideABIVersion.value(); }
+  void setOverrideABIVersion(uint8_t V) override { OverrideABIVersion = V; }
+
+  friend struct ELFWriter;
+};
+
+class ELFSingleObjectWriter : public ELFObjectWriter {
+  raw_pwrite_stream &OS;
+  bool IsLittleEndian;
+
+public:
+  ELFSingleObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                        raw_pwrite_stream &OS, bool IsLittleEndian)
+      : ELFObjectWriter(std::move(MOTW)), OS(OS),
+        IsLittleEndian(IsLittleEndian) {}
+
+  uint64_t writeObject(MCAssembler &Asm) override {
+    return ELFWriter(*this, OS, IsLittleEndian, ELFWriter::AllSections)
+        .writeObject(Asm);
+  }
+
+  friend struct ELFWriter;
+};
+
+class ELFDwoObjectWriter : public ELFObjectWriter {
+  raw_pwrite_stream &OS, &DwoOS;
+  bool IsLittleEndian;
+
+public:
+  ELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                     raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
+                     bool IsLittleEndian)
+      : ELFObjectWriter(std::move(MOTW)), OS(OS), DwoOS(DwoOS),
+        IsLittleEndian(IsLittleEndian) {}
+
+  bool checkRelocation(MCContext &Ctx, SMLoc Loc, const MCSectionELF *From,
+                       const MCSectionELF *To) override {
+    if (isDwoSection(*From)) {
+      Ctx.reportError(Loc, "A dwo section may not contain relocations");
+      return false;
+    }
+    if (To && isDwoSection(*To)) {
+      Ctx.reportError(Loc, "A relocation may not refer to a dwo section");
+      return false;
+    }
+    return true;
+  }
+
+  uint64_t writeObject(MCAssembler &Asm) override {
+    uint64_t Size = ELFWriter(*this, OS, IsLittleEndian, ELFWriter::NonDwoOnly)
+                        .writeObject(Asm);
+    Size += ELFWriter(*this, DwoOS, IsLittleEndian, ELFWriter::DwoOnly)
+                .writeObject(Asm);
+    return Size;
+  }
+};
+
 } // end anonymous namespace
 
 uint64_t ELFWriter::align(Align Alignment) {
@@ -292,8 +403,8 @@ void ELFWriter::writeHeader(const MCAssembler &Asm) {
                    ? int(ELF::ELFOSABI_GNU)
                    : OSABI);
   // e_ident[EI_ABIVERSION]
-  W.OS << char(OWriter.OverrideABIVersion
-                   ? *OWriter.OverrideABIVersion
+  W.OS << char(OWriter.seenOverrideABIVersion()
+                   ? OWriter.getOverrideABIVersion()
                    : OWriter.TargetObjectWriter->getABIVersion());
 
   W.OS.write_zeros(ELF::EI_NIDENT - ELF::EI_PAD);
@@ -308,7 +419,7 @@ void ELFWriter::writeHeader(const MCAssembler &Asm) {
   WriteWord(0);                     // e_shoff = sec hdr table off in bytes
 
   // e_flags = whatever the target wants
-  W.write<uint32_t>(OWriter.getELFHeaderEFlags());
+  W.write<uint32_t>(Asm.getELFHeaderEFlags());
 
   // e_ehsize = ELF header size
   W.write<uint16_t>(is64Bit() ? sizeof(ELF::Elf64_Ehdr)
@@ -508,7 +619,7 @@ void ELFWriter::computeSymbolTable(MCAssembler &Asm,
   std::vector<ELFSymbolData> LocalSymbolData;
   std::vector<ELFSymbolData> ExternalSymbolData;
   MutableArrayRef<std::pair<std::string, size_t>> FileNames =
-      OWriter.getFileNames();
+      Asm.getFileNames();
   for (const std::pair<std::string, size_t> &F : FileNames)
     StrTabBuilder.add(F.first);
 
@@ -668,7 +779,7 @@ void ELFWriter::computeSymbolTable(MCAssembler &Asm,
 }
 
 void ELFWriter::writeAddrsigSection() {
-  for (const MCSymbol *Sym : OWriter.getAddrsigSyms())
+  for (const MCSymbol *Sym : OWriter.AddrsigSyms)
     if (Sym->getIndex() != 0)
       encodeULEB128(Sym->getIndex(), W.OS);
 }
@@ -1039,7 +1150,7 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm) {
     StrTabBuilder.finalize();
   } else {
     MCSectionELF *AddrsigSection;
-    if (OWriter.getEmitAddrsigSection()) {
+    if (OWriter.EmitAddrsigSection) {
       AddrsigSection = Ctx.getELFSection(".llvm_addrsig", ELF::SHT_LLVM_ADDRSIG,
                                          ELF::SHF_EXCLUDE);
       addToSectionTable(AddrsigSection);
@@ -1059,7 +1170,7 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm) {
       RelSection->setOffsets(SecStart, SecEnd);
     }
 
-    if (OWriter.getEmitAddrsigSection()) {
+    if (OWriter.EmitAddrsigSection) {
       uint64_t SecStart = W.OS.tell();
       writeAddrsigSection();
       uint64_t SecEnd = W.OS.tell();
@@ -1104,26 +1215,6 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm) {
   return W.OS.tell() - StartOffset;
 }
 
-ELFObjectWriter::ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                                 raw_pwrite_stream &OS, bool IsLittleEndian)
-    : TargetObjectWriter(std::move(MOTW)), OS(OS),
-      IsLittleEndian(IsLittleEndian) {}
-ELFObjectWriter::ELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
-                                 raw_pwrite_stream &OS,
-                                 raw_pwrite_stream &DwoOS, bool IsLittleEndian)
-    : TargetObjectWriter(std::move(MOTW)), OS(OS), DwoOS(&DwoOS),
-      IsLittleEndian(IsLittleEndian) {}
-
-void ELFObjectWriter::reset() {
-  ELFHeaderEFlags = 0;
-  SeenGnuAbi = false;
-  OverrideABIVersion.reset();
-  Relocations.clear();
-  Renames.clear();
-  Symvers.clear();
-  MCObjectWriter::reset();
-}
-
 bool ELFObjectWriter::hasRelocationAddend() const {
   return TargetObjectWriter->hasRelocationAddend();
 }
@@ -1131,7 +1222,7 @@ bool ELFObjectWriter::hasRelocationAddend() const {
 void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) {
   // The presence of symbol versions causes undefined symbols and
   // versions declared with @@@ to be renamed.
-  for (const Symver &S : Symvers) {
+  for (const MCAssembler::Symver &S : Asm.Symvers) {
     StringRef AliasName = S.Name;
     const auto &Symbol = cast<MCSymbolELF>(*S.Sym);
     size_t Pos = AliasName.find('@');
@@ -1315,22 +1406,6 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm,
   return false;
 }
 
-bool ELFObjectWriter::checkRelocation(MCContext &Ctx, SMLoc Loc,
-                                      const MCSectionELF *From,
-                                      const MCSectionELF *To) {
-  if (DwoOS) {
-    if (isDwoSection(*From)) {
-      Ctx.reportError(Loc, "A dwo section may not contain relocations");
-      return false;
-    }
-    if (To && isDwoSection(*To)) {
-      Ctx.reportError(Loc, "A relocation may not refer to a dwo section");
-      return false;
-    }
-  }
-  return true;
-}
-
 void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
                                        const MCFragment *Fragment,
                                        const MCFixup &Fixup, MCValue Target,
@@ -1447,13 +1522,17 @@ bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
   return &SymA.getSection() == FB.getParent();
 }
 
-uint64_t ELFObjectWriter::writeObject(MCAssembler &Asm) {
-  uint64_t Size =
-      ELFWriter(*this, OS, IsLittleEndian,
-                DwoOS ? ELFWriter::NonDwoOnly : ELFWriter::AllSections)
-          .writeObject(Asm);
-  if (DwoOS)
-    Size += ELFWriter(*this, *DwoOS, IsLittleEndian, ELFWriter::DwoOnly)
-                .writeObject(Asm);
-  return Size;
+std::unique_ptr<MCObjectWriter>
+llvm::createELFObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                            raw_pwrite_stream &OS, bool IsLittleEndian) {
+  return std::make_unique<ELFSingleObjectWriter>(std::move(MOTW), OS,
+                                                  IsLittleEndian);
+}
+
+std::unique_ptr<MCObjectWriter>
+llvm::createELFDwoObjectWriter(std::unique_ptr<MCELFObjectTargetWriter> MOTW,
+                               raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS,
+                               bool IsLittleEndian) {
+  return std::make_unique<ELFDwoObjectWriter>(std::move(MOTW), OS, DwoOS,
+                                               IsLittleEndian);
 }
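
The restructuring above replaces one writer class holding an optional .dwo
stream with a small hierarchy: a base writer plus single-object and
split-DWARF subclasses, each overriding writeObject, created through free
factory functions. A rough sketch of the shape, using toy types rather than
the MC interfaces:

#include <cstdint>
#include <memory>

struct ObjectWriterBase {
  virtual ~ObjectWriterBase() = default;
  virtual uint64_t writeObject() = 0; // returns bytes written
};

struct SingleObjectWriter final : ObjectWriterBase {
  uint64_t writeObject() override {
    return 0; // emit all sections to one stream
  }
};

struct DwoObjectWriter final : ObjectWriterBase {
  uint64_t writeObject() override {
    uint64_t Size = 0; // emit non-dwo sections to the main stream
    Size += 0;         // then emit dwo sections to the .dwo stream
    return Size;
  }
};

std::unique_ptr<ObjectWriterBase> createSingleObjectWriter() {
  return std::make_unique<SingleObjectWriter>();
}
std::unique_ptr<ObjectWriterBase> createDwoObjectWriter() {
  return std::make_unique<DwoObjectWriter>();
}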
diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp
index 7433f41c9e0fb..fc36f29a90339 100644
--- a/llvm/lib/MC/MCAsmBackend.cpp
+++ b/llvm/lib/MC/MCAsmBackend.cpp
@@ -32,17 +32,16 @@ MCAsmBackend::~MCAsmBackend() = default;
 std::unique_ptr<MCObjectWriter>
 MCAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
   auto TW = createObjectTargetWriter();
-  bool IsLE = Endian == llvm::endianness::little;
   switch (TW->getFormat()) {
+  case Triple::ELF:
+    return createELFObjectWriter(cast<MCELFObjectTargetWriter>(std::move(TW)),
+                                 OS, Endian == llvm::endianness::little);
   case Triple::MachO:
     return createMachObjectWriter(cast<MCMachObjectTargetWriter>(std::move(TW)),
-                                  OS, IsLE);
+                                  OS, Endian == llvm::endianness::little);
   case Triple::COFF:
     return createWinCOFFObjectWriter(
         cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS);
-  case Triple::ELF:
-    return std::make_unique<ELFObjectWriter>(
-        cast<MCELFObjectTargetWriter>(std::move(TW)), OS, IsLE);
   case Triple::SPIRV:
     return createSPIRVObjectWriter(
         cast<MCSPIRVObjectTargetWriter>(std::move(TW)), OS);
@@ -72,7 +71,7 @@ MCAsmBackend::createDwoObjectWriter(raw_pwrite_stream &OS,
     return createWinCOFFDwoObjectWriter(
         cast<MCWinCOFFObjectTargetWriter>(std::move(TW)), OS, DwoOS);
   case Triple::ELF:
-    return std::make_unique<ELFObjectWriter>(
+    return createELFDwoObjectWriter(
         cast<MCELFObjectTargetWriter>(std::move(TW)), OS, DwoOS,
         Endian == llvm::endianness::little);
   case Triple::Wasm:
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index ceeb7af0fecc4..c8d12eb5dcf64 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -84,22 +84,38 @@ MCAssembler::MCAssembler(MCContext &Context,
                          std::unique_ptr<MCCodeEmitter> Emitter,
                          std::unique_ptr<MCObjectWriter> Writer)
     : Context(Context), Backend(std::move(Backend)),
-      Emitter(std::move(Emitter)), Writer(std::move(Writer)) {}
+      Emitter(std::move(Emitter)), Writer(std::move(Writer)) {
+  VersionInfo.Major = 0; // Major version == 0 for "none specified"
+  DarwinTargetVariantVersionInfo.Major = 0;
+}
+
+MCAssembler::~MCAssembler() = default;
 
 void MCAssembler::reset() {
   RelaxAll = false;
+  SubsectionsViaSymbols = false;
+  IncrementalLinkerCompatible = false;
   Sections.clear();
   Symbols.clear();
+  LinkerOptions.clear();
+  FileNames.clear();
   ThumbFuncs.clear();
   BundleAlignSize = 0;
+  ELFHeaderEFlags = 0;
+  LOHContainer.reset();
+  VersionInfo.Major = 0;
+  VersionInfo.SDKVersion = VersionTuple();
+  DarwinTargetVariantVersionInfo.Major = 0;
+  DarwinTargetVariantVersionInfo.SDKVersion = VersionTuple();
 
   // reset objects owned by us
   if (getBackendPtr())
     getBackendPtr()->reset();
   if (getEmitterPtr())
     getEmitterPtr()->reset();
-  if (Writer)
-    Writer->reset();
+  if (getWriterPtr())
+    getWriterPtr()->reset();
+  getLOHContainer().reset();
 }
 
 bool MCAssembler::registerSection(MCSection &Section) {
@@ -192,9 +208,9 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF,
       const MCSymbol &SA = A->getSymbol();
       if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined()) {
         IsResolved = false;
-      } else {
+      } else if (auto *Writer = getWriterPtr()) {
         IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) ||
-                     getWriter().isSymbolRefDifferenceFullyResolvedImpl(
+                     Writer->isSymbolRefDifferenceFullyResolvedImpl(
                          *this, SA, *DF, false, true);
       }
     }
@@ -432,6 +448,28 @@ void MCAssembler::layoutBundle(MCFragment *Prev, MCFragment *F) const {
       DF->Offset = EF->Offset;
 }
 
+void MCAssembler::ensureValid(MCSection &Sec) const {
+  if (Sec.hasLayout())
+    return;
+  Sec.setHasLayout(true);
+  MCFragment *Prev = nullptr;
+  uint64_t Offset = 0;
+  for (MCFragment &F : Sec) {
+    F.Offset = Offset;
+    if (isBundlingEnabled() && F.hasInstructions()) {
+      layoutBundle(Prev, &F);
+      Offset = F.Offset;
+    }
+    Offset += computeFragmentSize(F);
+    Prev = &F;
+  }
+}
+
+uint64_t MCAssembler::getFragmentOffset(const MCFragment &F) const {
+  ensureValid(*F.getParent());
+  return F.Offset;
+}
+
 // Simple getSymbolOffset helper for the non-variable case.
 static bool getLabelOffset(const MCAssembler &Asm, const MCSymbol &S,
                            bool ReportError, uint64_t &Val) {
@@ -916,20 +954,22 @@ void MCAssembler::layout() {
 
   // Layout until everything fits.
   this->HasLayout = true;
-  for (MCSection &Sec : *this)
-    layoutSection(Sec);
   while (layoutOnce()) {
+    if (getContext().hadError())
+      return;
+    // Size of fragments in one section can depend on the size of fragments in
+    // another. If any fragment has changed size, we have to re-layout (and
+    // as a result possibly further relax) all.
+    for (MCSection &Sec : *this)
+      Sec.setHasLayout(false);
   }
 
   DEBUG_WITH_TYPE("mc-dump", {
       errs() << "assembler backend - post-relaxation\n--\n";
       dump(); });
 
-  // Some targets might want to adjust fragment offsets. If so, perform another
-  // layout loop.
-  if (getBackend().finishLayout(*this))
-    for (MCSection &Sec : *this)
-      layoutSection(Sec);
+  // Finalize the layout, including fragment lowering.
+  getBackend().finishLayout(*this);
 
   DEBUG_WITH_TYPE("mc-dump", {
       errs() << "assembler backend - final-layout\n--\n";
@@ -1092,7 +1132,7 @@ bool MCAssembler::relaxLEB(MCLEBFragment &LF) {
   // Use evaluateKnownAbsolute for Mach-O as a hack: .subsections_via_symbols
   // requires that .uleb128 A-B is foldable where A and B reside in different
   // fragments. This is used by __gcc_except_table.
-  bool Abs = getWriter().getSubsectionsViaSymbols()
+  bool Abs = getSubsectionsViaSymbols()
                  ? LF.getValue().evaluateKnownAbsolute(Value, *this)
                  : LF.getValue().evaluateAsAbsolute(Value, *this);
   if (!Abs) {
@@ -1282,42 +1322,15 @@ bool MCAssembler::relaxFragment(MCFragment &F) {
   }
 }
 
-void MCAssembler::layoutSection(MCSection &Sec) {
-  MCFragment *Prev = nullptr;
-  uint64_t Offset = 0;
-  for (MCFragment &F : Sec) {
-    F.Offset = Offset;
-    if (LLVM_UNLIKELY(isBundlingEnabled())) {
-      if (F.hasInstructions()) {
-        layoutBundle(Prev, &F);
-        Offset = F.Offset;
-      }
-      Prev = &F;
-    }
-    Offset += computeFragmentSize(F);
-  }
-}
-
 bool MCAssembler::layoutOnce() {
   ++stats::RelaxationSteps;
 
-  // Size of fragments in one section can depend on the size of fragments in
-  // another. If any fragment has changed size, we have to re-layout (and
-  // as a result possibly further relax) all.
-  bool ChangedAny = false;
-  for (MCSection &Sec : *this) {
-    for (;;) {
-      bool Changed = false;
-      for (MCFragment &F : Sec)
-        if (relaxFragment(F))
-          Changed = true;
-      ChangedAny |= Changed;
-      if (!Changed)
-        break;
-      layoutSection(Sec);
-    }
-  }
-  return ChangedAny;
+  bool Changed = false;
+  for (MCSection &Sec : *this)
+    for (MCFragment &Frag : Sec)
+      if (relaxFragment(Frag))
+        Changed = true;
+  return Changed;
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
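
The layoutOnce/layout changes above restore a simple fixed-point scheme: relax
every fragment once per pass, and if anything changed size, drop all cached
section layouts and iterate, since fragment sizes in one section can depend on
offsets in another. A toy sketch of that loop (Fragment here is illustrative,
not the MC class):

#include <vector>

struct Fragment {
  int Size = 0;
  bool NeedsGrow = false; // e.g. a short branch that no longer reaches
};

static bool relaxOnce(std::vector<Fragment> &Frags) {
  bool Changed = false;
  for (Fragment &F : Frags) {
    if (F.NeedsGrow) { // widen the encoding; the fragment changes size
      F.Size += 4;
      F.NeedsGrow = false;
      Changed = true;
    }
  }
  return Changed;
}

void layoutUntilFixedPoint(std::vector<Fragment> &Frags, bool &SecHasLayout) {
  while (relaxOnce(Frags))
    SecHasLayout = false; // invalidate cached offsets; recomputed lazily
}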
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index 228c4fb03a276..2fe8369016440 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -349,11 +349,6 @@ MCSymbol *MCContext::createNamedTempSymbol() {
   return createNamedTempSymbol("tmp");
 }
 
-MCSymbol *MCContext::createLocalSymbol(StringRef Name) {
-  MCSymbolTableEntry &NameEntry = getSymbolTableEntry(Name);
-  return createSymbolImpl(&NameEntry, /*IsTemporary=*/false);
-}
-
 unsigned MCContext::NextInstance(unsigned LocalLabelVal) {
   MCLabel *&Label = Instances[LocalLabelVal];
   if (!Label)
@@ -800,6 +795,7 @@ bool MCContext::hasXCOFFSection(StringRef Section,
 MCSectionXCOFF *MCContext::getXCOFFSection(
     StringRef Section, SectionKind Kind,
     std::optional<XCOFF::CsectProperties> CsectProp, bool MultiSymbolsAllowed,
+    const char *BeginSymName,
     std::optional<XCOFF::DwarfSectionSubtypeFlags> DwarfSectionSubtypeFlags) {
   bool IsDwarfSec = DwarfSectionSubtypeFlags.has_value();
   assert((IsDwarfSec != CsectProp.has_value()) && "Invalid XCOFF section!");
@@ -829,29 +825,35 @@ MCSectionXCOFF *MCContext::getXCOFFSection(
         CachedName + "[" +
         XCOFF::getMappingClassString(CsectProp->MappingClass) + "]"));
 
+  MCSymbol *Begin = nullptr;
+  if (BeginSymName)
+    Begin = createTempSymbol(BeginSymName, false);
+
   // QualName->getUnqualifiedName() and CachedName are the same except when
   // CachedName contains invalid character(s) such as '$' for an XCOFF symbol.
   MCSectionXCOFF *Result = nullptr;
   if (IsDwarfSec)
     Result = new (XCOFFAllocator.Allocate()) MCSectionXCOFF(
         QualName->getUnqualifiedName(), Kind, QualName,
-        *DwarfSectionSubtypeFlags, QualName, CachedName, MultiSymbolsAllowed);
+        *DwarfSectionSubtypeFlags, Begin, CachedName, MultiSymbolsAllowed);
   else
     Result = new (XCOFFAllocator.Allocate())
         MCSectionXCOFF(QualName->getUnqualifiedName(), CsectProp->MappingClass,
-                       CsectProp->Type, Kind, QualName, nullptr, CachedName,
+                       CsectProp->Type, Kind, QualName, Begin, CachedName,
                        MultiSymbolsAllowed);
 
   Entry.second = Result;
 
   auto *F = allocInitialFragment(*Result);
+  if (Begin)
+    Begin->setFragment(F);
 
   // We might miss calculating the symbols difference as absolute value before
   // adding fixups when symbol_A without the fragment set is the csect itself
   // and symbol_B is in it.
-  // TODO: Currently we only set the fragment for XMC_PR csects and DWARF
-  // sections because we don't have other cases that hit this problem yet.
-  if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR)
+  // TODO: Currently we only set the fragment for XMC_PR csects because we don't
+  // have other cases that hit this problem yet.
+  if (!IsDwarfSec && CsectProp->MappingClass == XCOFF::XMC_PR)
     QualName->setFragment(F);
 
   return Result;
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index c84d88e276f40..38ebaccf3b6ff 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -46,10 +46,6 @@ MCELFStreamer::MCELFStreamer(MCContext &Context,
     : MCObjectStreamer(Context, std::move(TAB), std::move(OW),
                        std::move(Emitter)) {}
 
-ELFObjectWriter &MCELFStreamer::getWriter() {
-  return static_cast<ELFObjectWriter &>(getAssembler().getWriter());
-}
-
 bool MCELFStreamer::isBundleLocked() const {
   return getCurrentSectionOnly()->isBundleLocked();
 }
@@ -88,6 +84,18 @@ void MCELFStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCDataFragment &F,
 void MCELFStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
   // Let the target do whatever target specific stuff it needs to do.
   getAssembler().getBackend().handleAssemblerFlag(Flag);
+  // Do any generic stuff we need to do.
+  switch (Flag) {
+  case MCAF_SyntaxUnified: return; // no-op here.
+  case MCAF_Code16: return; // Change parsing mode; no-op here.
+  case MCAF_Code32: return; // Change parsing mode; no-op here.
+  case MCAF_Code64: return; // Change parsing mode; no-op here.
+  case MCAF_SubsectionsViaSymbols:
+    getAssembler().setSubsectionsViaSymbols(true);
+    return;
+  }
+
+  llvm_unreachable("invalid assembler flag!");
 }
 
 // If bundle alignment is used and there are any instructions in the section, it
@@ -112,7 +120,7 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
   if (Grp)
     Asm.registerSymbol(*Grp);
   if (SectionELF->getFlags() & ELF::SHF_GNU_RETAIN)
-    getWriter().markGnuAbi();
+    Asm.getWriter().markGnuAbi();
 
   changeSectionImpl(Section, Subsection);
   Asm.registerSymbol(*Section->getBeginSymbol());
@@ -180,7 +188,7 @@ bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
   case MCSA_ELF_TypeGnuUniqueObject:
     Symbol->setType(CombineSymbolTypes(Symbol->getType(), ELF::STT_OBJECT));
     Symbol->setBinding(ELF::STB_GNU_UNIQUE);
-    getWriter().markGnuAbi();
+    getAssembler().getWriter().markGnuAbi();
     break;
 
   case MCSA_Global:
@@ -218,7 +226,7 @@ bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
 
   case MCSA_ELF_TypeIndFunction:
     Symbol->setType(CombineSymbolTypes(Symbol->getType(), ELF::STT_GNU_IFUNC));
-    getWriter().markGnuAbi();
+    getAssembler().getWriter().markGnuAbi();
     break;
 
   case MCSA_ELF_TypeObject:
@@ -302,7 +310,7 @@ void MCELFStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
 void MCELFStreamer::emitELFSymverDirective(const MCSymbol *OriginalSym,
                                            StringRef Name,
                                            bool KeepOriginalSym) {
-  getWriter().Symvers.push_back(ELFObjectWriter::Symver{
+  getAssembler().Symvers.push_back(MCAssembler::Symver{
       getStartTokLoc(), OriginalSym, Name, KeepOriginalSym});
 }
 
@@ -335,7 +343,7 @@ void MCELFStreamer::emitValueToAlignment(Align Alignment, int64_t Value,
 void MCELFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
                                        const MCSymbolRefExpr *To,
                                        uint64_t Count) {
-  getWriter().getCGProfile().push_back({From, To, Count});
+  getAssembler().CGProfile.push_back({From, To, Count});
 }
 
 void MCELFStreamer::emitIdent(StringRef IdentString) {
@@ -464,8 +472,8 @@ void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE,
 }
 
 void MCELFStreamer::finalizeCGProfile() {
-  ELFObjectWriter &W = getWriter();
-  if (W.getCGProfile().empty())
+  MCAssembler &Asm = getAssembler();
+  if (Asm.CGProfile.empty())
     return;
   MCSection *CGProfile = getAssembler().getContext().getELFSection(
       ".llvm.call-graph-profile", ELF::SHT_LLVM_CALL_GRAPH_PROFILE,
@@ -473,7 +481,7 @@ void MCELFStreamer::finalizeCGProfile() {
   pushSection();
   switchSection(CGProfile);
   uint64_t Offset = 0;
-  for (auto &E : W.getCGProfile()) {
+  for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) {
     finalizeCGProfileEntry(E.From, Offset);
     finalizeCGProfileEntry(E.To, Offset);
     emitIntValue(E.Count, sizeof(uint64_t));
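
A tiny sketch of where call-graph-profile entries now live, per the hunks
above: streamers append {From, To, Count} records to a list owned by the
assembler instead of the object writer. Types here are illustrative
stand-ins, with symbol names in place of MCSymbolRefExpr:

#include <cstdint>
#include <string>
#include <vector>

struct CGProfileEntry {
  std::string From, To; // caller and callee symbols
  uint64_t Count;
};

struct Assembler {
  std::vector<CGProfileEntry> CGProfile; // shared by ELF/MachO/COFF streamers
};

void emitCGProfileEntry(Assembler &Asm, const std::string &From,
                        const std::string &To, uint64_t Count) {
  Asm.CGProfile.push_back({From, To, Count}); // mirrors the streamer hooks
}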
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 528caa12ec212..cb2e4e70ff395 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -118,12 +118,12 @@ class MCMachOStreamer : public MCObjectStreamer {
   }
 
   void emitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override {
-    getWriter().getLOHContainer().addDirective(Kind, Args);
+    getAssembler().getLOHContainer().addDirective(Kind, Args);
   }
   void emitCGProfileEntry(const MCSymbolRefExpr *From,
                           const MCSymbolRefExpr *To, uint64_t Count) override {
     if (!From->getSymbol().isTemporary() && !To->getSymbol().isTemporary())
-      getWriter().getCGProfile().push_back({From, To, Count});
+      getAssembler().CGProfile.push_back({From, To, Count});
   }
 
   void finishImpl() override;
@@ -220,13 +220,13 @@ void MCMachOStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
   case MCAF_Code32: return; // Change parsing mode; no-op here.
   case MCAF_Code64: return; // Change parsing mode; no-op here.
   case MCAF_SubsectionsViaSymbols:
-    getWriter().setSubsectionsViaSymbols(true);
+    getAssembler().setSubsectionsViaSymbols(true);
     return;
   }
 }
 
 void MCMachOStreamer::emitLinkerOptions(ArrayRef<std::string> Options) {
-  getWriter().getLinkerOptions().push_back(Options);
+  getAssembler().getLinkerOptions().push_back(Options);
 }
 
 void MCMachOStreamer::emitDataRegion(MCDataRegionType Kind) {
@@ -252,21 +252,21 @@ void MCMachOStreamer::emitDataRegion(MCDataRegionType Kind) {
 void MCMachOStreamer::emitVersionMin(MCVersionMinType Kind, unsigned Major,
                                      unsigned Minor, unsigned Update,
                                      VersionTuple SDKVersion) {
-  getWriter().setVersionMin(Kind, Major, Minor, Update, SDKVersion);
+  getAssembler().setVersionMin(Kind, Major, Minor, Update, SDKVersion);
 }
 
 void MCMachOStreamer::emitBuildVersion(unsigned Platform, unsigned Major,
                                        unsigned Minor, unsigned Update,
                                        VersionTuple SDKVersion) {
-  getWriter().setBuildVersion((MachO::PlatformType)Platform, Major, Minor,
-                              Update, SDKVersion);
+  getAssembler().setBuildVersion((MachO::PlatformType)Platform, Major, Minor,
+                                 Update, SDKVersion);
 }
 
 void MCMachOStreamer::emitDarwinTargetVariantBuildVersion(
     unsigned Platform, unsigned Major, unsigned Minor, unsigned Update,
     VersionTuple SDKVersion) {
-  getWriter().setTargetVariantBuildVersion((MachO::PlatformType)Platform, Major,
-                                           Minor, Update, SDKVersion);
+  getAssembler().setDarwinTargetVariantBuildVersion(
+      (MachO::PlatformType)Platform, Major, Minor, Update, SDKVersion);
 }
 
 void MCMachOStreamer::emitThumbFunc(MCSymbol *Symbol) {
@@ -506,10 +506,9 @@ void MCMachOStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) {
 
 void MCMachOStreamer::finalizeCGProfile() {
   MCAssembler &Asm = getAssembler();
-  MCObjectWriter &W = getWriter();
-  if (W.getCGProfile().empty())
+  if (Asm.CGProfile.empty())
     return;
-  for (auto &E : W.getCGProfile()) {
+  for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) {
     finalizeCGProfileEntry(E.From);
     finalizeCGProfileEntry(E.To);
   }
@@ -521,7 +520,7 @@ void MCMachOStreamer::finalizeCGProfile() {
   changeSection(CGProfileSection);
   // For each entry, reserve space for 2 32-bit indices and a 64-bit count.
   size_t SectionBytes =
-      W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
+      Asm.CGProfile.size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
   cast<MCDataFragment>(*CGProfileSection->begin())
       .getContents()
       .resize(SectionBytes);
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 6dadd9752646f..7a2b43a954b10 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -948,52 +948,52 @@ void MCObjectFileInfo::initXCOFFMCObjectFileInfo(const Triple &T) {
   DwarfAbbrevSection = Ctx->getXCOFFSection(
       ".dwabrev", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWABREV);
+      /* MultiSymbolsAllowed */ true, ".dwabrev", XCOFF::SSUBTYP_DWABREV);
 
   DwarfInfoSection = Ctx->getXCOFFSection(
       ".dwinfo", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWINFO);
+      /* MultiSymbolsAllowed */ true, ".dwinfo", XCOFF::SSUBTYP_DWINFO);
 
   DwarfLineSection = Ctx->getXCOFFSection(
       ".dwline", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWLINE);
+      /* MultiSymbolsAllowed */ true, ".dwline", XCOFF::SSUBTYP_DWLINE);
 
   DwarfFrameSection = Ctx->getXCOFFSection(
       ".dwframe", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWFRAME);
+      /* MultiSymbolsAllowed */ true, ".dwframe", XCOFF::SSUBTYP_DWFRAME);
 
   DwarfPubNamesSection = Ctx->getXCOFFSection(
       ".dwpbnms", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWPBNMS);
+      /* MultiSymbolsAllowed */ true, ".dwpbnms", XCOFF::SSUBTYP_DWPBNMS);
 
   DwarfPubTypesSection = Ctx->getXCOFFSection(
       ".dwpbtyp", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWPBTYP);
+      /* MultiSymbolsAllowed */ true, ".dwpbtyp", XCOFF::SSUBTYP_DWPBTYP);
 
   DwarfStrSection = Ctx->getXCOFFSection(
       ".dwstr", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWSTR);
+      /* MultiSymbolsAllowed */ true, ".dwstr", XCOFF::SSUBTYP_DWSTR);
 
   DwarfLocSection = Ctx->getXCOFFSection(
       ".dwloc", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWLOC);
+      /* MultiSymbolsAllowed */ true, ".dwloc", XCOFF::SSUBTYP_DWLOC);
 
   DwarfARangesSection = Ctx->getXCOFFSection(
       ".dwarnge", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWARNGE);
+      /* MultiSymbolsAllowed */ true, ".dwarnge", XCOFF::SSUBTYP_DWARNGE);
 
   DwarfRangesSection = Ctx->getXCOFFSection(
       ".dwrnges", SectionKind::getMetadata(),
       /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWRNGES);
+      /* MultiSymbolsAllowed */ true, ".dwrnges", XCOFF::SSUBTYP_DWRNGES);
 
   DwarfMacinfoSection = Ctx->getXCOFFSection(
       ".dwmac", SectionKind::getMetadata(), /* CsectProperties */ std::nullopt,
-      /* MultiSymbolsAllowed */ true, XCOFF::SSUBTYP_DWMAC);
+      /* MultiSymbolsAllowed */ true, ".dwmac", XCOFF::SSUBTYP_DWMAC);
 }
 
 void MCObjectFileInfo::initDXContainerObjectFileInfo(const Triple &T) {
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 9dc3974fd8f0d..a72e34fe6fd33 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -32,8 +32,8 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context,
       Assembler(std::make_unique<MCAssembler>(
           Context, std::move(TAB), std::move(Emitter), std::move(OW))),
       EmitEHFrame(true), EmitDebugFrame(false) {
-  assert(Assembler->getBackendPtr() && Assembler->getEmitterPtr());
-  setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
+  if (Assembler->getBackendPtr())
+    setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
   if (Context.getTargetOptions() && Context.getTargetOptions()->MCRelaxAll)
     Assembler->setRelaxAll(true);
 }
@@ -784,18 +784,15 @@ void MCObjectStreamer::emitNops(int64_t NumBytes, int64_t ControlledNopLength,
 }
 
 void MCObjectStreamer::emitFileDirective(StringRef Filename) {
-  MCAssembler &Asm = getAssembler();
-  Asm.getWriter().addFileName(Asm, Filename);
+  getAssembler().addFileName(Filename);
 }
 
 void MCObjectStreamer::emitFileDirective(StringRef Filename,
                                          StringRef CompilerVersion,
                                          StringRef TimeStamp,
                                          StringRef Description) {
-  MCObjectWriter &W = getAssembler().getWriter();
-  W.addFileName(getAssembler(), Filename);
-  if (CompilerVersion.size())
-    W.setCompilerVersion(CompilerVersion);
+  getAssembler().addFileName(Filename);
+  getAssembler().setCompilerVersion(CompilerVersion.str());
   // TODO: add TimeStamp and Description to .file symbol table entry
   // with the integrated assembler.
 }
diff --git a/llvm/lib/MC/MCObjectWriter.cpp b/llvm/lib/MC/MCObjectWriter.cpp
index 818e703514d61..d321e581bbf0e 100644
--- a/llvm/lib/MC/MCObjectWriter.cpp
+++ b/llvm/lib/MC/MCObjectWriter.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCSymbol.h"
@@ -19,14 +18,6 @@ using namespace llvm;
 
 MCObjectWriter::~MCObjectWriter() = default;
 
-void MCObjectWriter::reset() {
-  FileNames.clear();
-  AddrsigSyms.clear();
-  EmitAddrsigSection = false;
-  SubsectionsViaSymbols = false;
-  CGProfile.clear();
-}
-
 bool MCObjectWriter::isSymbolRefDifferenceFullyResolved(
     const MCAssembler &Asm, const MCSymbolRefExpr *A, const MCSymbolRefExpr *B,
     bool InSet) const {
@@ -53,7 +44,3 @@ bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
   // On ELF and COFF  A - B is absolute if A and B are in the same section.
   return &SecA == &SecB;
 }
-
-void MCObjectWriter::addFileName(MCAssembler &Asm, StringRef FileName) {
-  FileNames.emplace_back(std::string(FileName), Asm.Symbols.size());
-}
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 97e87a41c8ce5..8c2ee5635a49c 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -23,8 +23,8 @@ using namespace llvm;
 MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText,
                      bool IsVirtual, MCSymbol *Begin)
     : Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false),
-      IsRegistered(false), IsText(IsText), IsVirtual(IsVirtual), Name(Name),
-      Variant(V) {
+      HasLayout(false), IsRegistered(false), IsText(IsText),
+      IsVirtual(IsVirtual), Name(Name), Variant(V) {
   DummyFragment.setParent(this);
   // The initial subsection number is 0. Create a fragment list.
   CurFragList = &Subsections.emplace_back(0u, FragList{}).second;
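
The new HasLayout bit added above pairs with MCAssembler::ensureValid earlier
in this patch: fragment offsets are computed lazily per section and cached
until the flag is cleared. A small sketch of that caching, under the same
simplifying toy-type assumptions as the earlier examples:

#include <cstddef>
#include <cstdint>
#include <vector>

struct Frag {
  uint64_t Offset = 0;
  uint64_t Size = 0;
};

struct Section {
  bool HasLayout = false;
  std::vector<Frag> Frags;

  void ensureValid() {
    if (HasLayout)
      return;            // offsets already cached
    HasLayout = true;
    uint64_t Offset = 0;
    for (Frag &F : Frags) {
      F.Offset = Offset; // assign running offset
      Offset += F.Size;
    }
  }

  uint64_t getFragmentOffset(size_t I) {
    ensureValid();       // lazily lay out on first query
    return Frags[I].Offset;
  }
};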
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
index 41043b2aeb93f..00ca5d2067d58 100644
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ b/llvm/lib/MC/MCSectionXCOFF.cpp
@@ -124,7 +124,7 @@ void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
   if (getKind().isMetadata() && isDwarfSect()) {
     OS << "\n\t.dwsect " << format("0x%" PRIx32, *getDwarfSubtypeFlags())
        << '\n';
-    OS << getName() << ':' << '\n';
+    OS << MAI.getPrivateLabelPrefix() << getName() << ':' << '\n';
     return;
   }
 
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 92329e079f93a..a14d3bcf37f3f 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -28,7 +28,6 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSymbolCOFF.h"
 #include "llvm/MC/MCTargetOptions.h"
-#include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -50,11 +49,7 @@ MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context,
       CurSymbol(nullptr) {
   auto *TO = Context.getTargetOptions();
   if (TO && TO->MCIncrementalLinkerCompatible)
-    getWriter().setIncrementalLinkerCompatible(true);
-}
-
-WinCOFFObjectWriter &MCWinCOFFStreamer::getWriter() {
-  return static_cast<WinCOFFObjectWriter &>(getAssembler().getWriter());
+    getAssembler().setIncrementalLinkerCompatible(true);
 }
 
 void MCWinCOFFStreamer::emitInstToData(const MCInst &Inst,
@@ -360,7 +355,7 @@ void MCWinCOFFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
                                            uint64_t Count) {
   // Ignore temporary symbols for now.
   if (!From->getSymbol().isTemporary() && !To->getSymbol().isTemporary())
-    getWriter().getCGProfile().push_back({From, To, Count});
+    getAssembler().CGProfile.push_back({From, To, Count});
 }
 
 void MCWinCOFFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) {
@@ -376,8 +371,8 @@ void MCWinCOFFStreamer::finishImpl() {
     switchSection(Asm.getContext().getCOFFSection(".llvm_addrsig",
                                                   COFF::IMAGE_SCN_LNK_REMOVE));
   }
-  if (!Asm.getWriter().getCGProfile().empty()) {
-    for (auto &E : Asm.getWriter().getCGProfile()) {
+  if (!Asm.CGProfile.empty()) {
+    for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) {
       finalizeCGProfileEntry(E.From);
       finalizeCGProfileEntry(E.To);
     }
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 94aa1ebc8f9e1..9cd46e504b554 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -19,7 +19,6 @@
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionXCOFF.h"
 #include "llvm/MC/MCSymbolXCOFF.h"
-#include "llvm/MC/MCXCOFFObjectWriter.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 
@@ -109,14 +108,12 @@ void MCXCOFFStreamer::emitXCOFFExceptDirective(const MCSymbol *Symbol,
                                                unsigned Lang, unsigned Reason,
                                                unsigned FunctionSize,
                                                bool hasDebug) {
-  // TODO: Export XCOFFObjectWriter to llvm/MC/MCXCOFFObjectWriter.h and access
-  // it from MCXCOFFStreamer.
-  XCOFF::addExceptionEntry(getAssembler().getWriter(), Symbol, Trap, Lang,
-                           Reason, FunctionSize, hasDebug);
+  getAssembler().getWriter().addExceptionEntry(Symbol, Trap, Lang, Reason,
+                                               FunctionSize, hasDebug);
 }
 
 void MCXCOFFStreamer::emitXCOFFCInfoSym(StringRef Name, StringRef Metadata) {
-  XCOFF::addCInfoSymEntry(getAssembler().getWriter(), Name, Metadata);
+  getAssembler().getWriter().addCInfoSymEntry(Name, Metadata);
 }
 
 void MCXCOFFStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 97fe254e4e979..f5435c6f3dd16 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -55,12 +55,6 @@ void MachObjectWriter::reset() {
   LocalSymbolData.clear();
   ExternalSymbolData.clear();
   UndefinedSymbolData.clear();
-  LOHContainer.reset();
-  VersionInfo.Major = 0;
-  VersionInfo.SDKVersion = VersionTuple();
-  TargetVariantVersionInfo.Major = 0;
-  TargetVariantVersionInfo.SDKVersion = VersionTuple();
-  LinkerOptions.clear();
   MCObjectWriter::reset();
 }
 
@@ -744,7 +738,7 @@ bool MachObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
     if (!hasReliableSymbolDifference) {
       if (!SA.isInSection() || &SecA != &SecB ||
           (!SA.isTemporary() && FB.getAtom() != SA.getFragment()->getAtom() &&
-           SubsectionsViaSymbols))
+           Asm.getSubsectionsViaSymbols()))
         return false;
       return true;
     }
@@ -792,13 +786,13 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   computeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
                      UndefinedSymbolData);
 
-  if (!CGProfile.empty()) {
+  if (!Asm.CGProfile.empty()) {
     MCSection *CGProfileSection = Asm.getContext().getMachOSection(
         "__LLVM", "__cg_profile", 0, SectionKind::getMetadata());
     auto &Frag = cast<MCDataFragment>(*CGProfileSection->begin());
     Frag.getContents().clear();
     raw_svector_ostream OS(Frag.getContents());
-    for (const MCObjectWriter::CGProfileEntry &CGPE : CGProfile) {
+    for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
       uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
       uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
       support::endian::write(OS, FromIndex, W.Endian);
@@ -808,6 +802,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   }
 
   unsigned NumSections = Asm.end() - Asm.begin();
+  const MCAssembler::VersionInfoType &VersionInfo = Asm.getVersionInfo();
 
   // The section data starts after the header, the segment load command (and
   // section headers) and the symbol table.
@@ -825,6 +820,9 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
       LoadCommandsSize += sizeof(MachO::version_min_command);
   }
 
+  const MCAssembler::VersionInfoType &TargetVariantVersionInfo =
+      Asm.getDarwinTargetVariantVersionInfo();
+
   // Add the target variant version info load command size, if used.
   if (TargetVariantVersionInfo.Major != 0) {
     ++NumLoadCommands;
@@ -841,7 +839,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   }
 
   // Add the loh load command size, if used.
-  uint64_t LOHRawSize = LOHContainer.getEmitSize(Asm, *this);
+  uint64_t LOHRawSize = Asm.getLOHContainer().getEmitSize(Asm, *this);
   uint64_t LOHSize = alignTo(LOHRawSize, is64Bit() ? 8 : 4);
   if (LOHSize) {
     ++NumLoadCommands;
@@ -858,7 +856,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   }
 
   // Add the linker option load commands sizes.
-  for (const auto &Option : LinkerOptions) {
+  for (const auto &Option : Asm.getLinkerOptions()) {
     ++NumLoadCommands;
     LoadCommandsSize += ComputeLinkerOptionsLoadCommandSize(Option, is64Bit());
   }
@@ -894,7 +892,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
 
   // Write the prolog, starting with the header and load command...
   writeHeader(MachO::MH_OBJECT, NumLoadCommands, LoadCommandsSize,
-              SubsectionsViaSymbols);
+              Asm.getSubsectionsViaSymbols());
   uint32_t Prot =
       MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE;
   writeSegmentLoadCommand("", NumSections, 0, VMSize, SectionDataStart,
@@ -929,7 +927,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
 
   // Write out the deployment target information, if it's available.
   auto EmitDeploymentTargetVersion =
-      [&](const VersionInfoType &VersionInfo) {
+      [&](const MCAssembler::VersionInfoType &VersionInfo) {
         auto EncodeVersion = [](VersionTuple V) -> uint32_t {
           assert(!V.empty() && "empty version");
           unsigned Update = V.getSubminor().value_or(0);
@@ -1017,7 +1015,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
   }
 
   // Write the linker options load commands.
-  for (const auto &Option : LinkerOptions)
+  for (const auto &Option : Asm.getLinkerOptions())
     writeLinkerOptionsLoadCommand(Option);
 
   // Write the actual section data.
@@ -1065,7 +1063,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) {
 #ifndef NDEBUG
     unsigned Start = W.OS.tell();
 #endif
-    LOHContainer.emit(Asm, *this);
+    Asm.getLOHContainer().emit(Asm, *this);
     // Pad to a multiple of the pointer size.
     W.OS.write_zeros(
         offsetToAlignment(LOHRawSize, is64Bit() ? Align(8) : Align(4)));
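
The deployment-target load commands above pack a VersionTuple into one 32-bit
word. A minimal sketch of that encoding, assuming it mirrors the EncodeVersion
lambda in this file (major in bits 31:16, minor in 15:8, update in 7:0):

    #include <cstdint>

    // Pack major.minor.update the way Mach-O version load commands
    // expect; a missing update component falls back to 0.
    static uint32_t encodeVersion(uint32_t Major, uint32_t Minor,
                                  uint32_t Update) {
      return (Major << 16) | (Minor << 8) | Update;
    }

    // encodeVersion(14, 4, 1) == 0x000E0401 for a 14.4.1 target.
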
diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp
index ef3d4b029e43e..91bde7ab62c1e 100644
--- a/llvm/lib/MC/SPIRVObjectWriter.cpp
+++ b/llvm/lib/MC/SPIRVObjectWriter.cpp
@@ -14,27 +14,47 @@
 
 using namespace llvm;
 
+namespace {
+class SPIRVObjectWriter : public MCObjectWriter {
+  support::endian::Writer W;
+
+  /// The target specific SPIR-V writer instance.
+  std::unique_ptr<MCSPIRVObjectTargetWriter> TargetObjectWriter;
+
+public:
+  SPIRVObjectWriter(std::unique_ptr<MCSPIRVObjectTargetWriter> MOTW,
+                    raw_pwrite_stream &OS)
+      : W(OS, llvm::endianness::little), TargetObjectWriter(std::move(MOTW)) {}
+
+  ~SPIRVObjectWriter() override {}
+
+private:
+  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override {}
+
+  uint64_t writeObject(MCAssembler &Asm) override;
+  void writeHeader(const MCAssembler &Asm);
+};
+} // namespace
+
 void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) {
   constexpr uint32_t MagicNumber = 0x07230203;
   constexpr uint32_t GeneratorID = 43;
   constexpr uint32_t GeneratorMagicNumber =
       (GeneratorID << 16) | (LLVM_VERSION_MAJOR);
   constexpr uint32_t Schema = 0;
+  const MCAssembler::VersionInfoType &VIT = Asm.getVersionInfo();
+  uint32_t VersionNumber = 0 | (VIT.Major << 16) | (VIT.Minor << 8);
+  uint32_t Bound = VIT.Update;
 
   W.write<uint32_t>(MagicNumber);
-  W.write<uint32_t>((VersionInfo.Major << 16) | (VersionInfo.Minor << 8));
+  W.write<uint32_t>(VersionNumber);
   W.write<uint32_t>(GeneratorMagicNumber);
-  W.write<uint32_t>(VersionInfo.Bound);
+  W.write<uint32_t>(Bound);
   W.write<uint32_t>(Schema);
 }
 
-void SPIRVObjectWriter::setBuildVersion(unsigned Major, unsigned Minor,
-                                        unsigned Bound) {
-  VersionInfo.Major = Major;
-  VersionInfo.Minor = Minor;
-  VersionInfo.Bound = Bound;
-}
-
 uint64_t SPIRVObjectWriter::writeObject(MCAssembler &Asm) {
   uint64_t StartOffset = W.OS.tell();
   writeHeader(Asm);
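
The header written above is the standard five-word SPIR-V module prologue. A
self-contained sketch built from the same constants the patch uses:

    #include <cstdint>
    #include <vector>

    // First five 32-bit words of a SPIR-V module: magic, version
    // (major in bits 23:16, minor in 15:8), generator magic (the
    // registered LLVM generator ID 43 in the high half, the tool
    // version in the low half), ID bound, and schema (0 here).
    std::vector<uint32_t> spirvHeader(uint32_t Major, uint32_t Minor,
                                      uint32_t Bound, uint32_t ToolVersion) {
      return {0x07230203u, (Major << 16) | (Minor << 8),
              (43u << 16) | ToolVersion, Bound, 0u};
    }
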
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 62f53423126ea..7ba38be7edba9 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -119,9 +119,10 @@ class COFFSection {
 
   SmallVector<COFFSymbol *, 1> OffsetSymbols;
 };
-} // namespace
 
-class llvm::WinCOFFWriter {
+class WinCOFFObjectWriter;
+
+class WinCOFFWriter {
   WinCOFFObjectWriter &OWriter;
   support::endian::Writer W;
 
@@ -195,19 +196,41 @@ class llvm::WinCOFFWriter {
   void assignFileOffsets(MCAssembler &Asm);
 };
 
-WinCOFFObjectWriter::WinCOFFObjectWriter(
-    std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS)
-    : TargetObjectWriter(std::move(MOTW)),
-      ObjWriter(std::make_unique<WinCOFFWriter>(*this, OS,
-                                                WinCOFFWriter::AllSections)) {}
-WinCOFFObjectWriter::WinCOFFObjectWriter(
-    std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS,
-    raw_pwrite_stream &DwoOS)
-    : TargetObjectWriter(std::move(MOTW)),
-      ObjWriter(std::make_unique<WinCOFFWriter>(*this, OS,
-                                                WinCOFFWriter::NonDwoOnly)),
-      DwoWriter(std::make_unique<WinCOFFWriter>(*this, DwoOS,
-                                                WinCOFFWriter::DwoOnly)) {}
+class WinCOFFObjectWriter : public MCObjectWriter {
+  friend class WinCOFFWriter;
+
+  std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
+  std::unique_ptr<WinCOFFWriter> ObjWriter, DwoWriter;
+
+public:
+  WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
+                      raw_pwrite_stream &OS)
+      : TargetObjectWriter(std::move(MOTW)),
+        ObjWriter(std::make_unique<WinCOFFWriter>(*this, OS,
+                                                  WinCOFFWriter::AllSections)) {
+  }
+  WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
+                      raw_pwrite_stream &OS, raw_pwrite_stream &DwoOS)
+      : TargetObjectWriter(std::move(MOTW)),
+        ObjWriter(std::make_unique<WinCOFFWriter>(*this, OS,
+                                                  WinCOFFWriter::NonDwoOnly)),
+        DwoWriter(std::make_unique<WinCOFFWriter>(*this, DwoOS,
+                                                  WinCOFFWriter::DwoOnly)) {}
+
+  // MCObjectWriter interface implementation.
+  void reset() override;
+  void executePostLayoutBinding(MCAssembler &Asm) override;
+  bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                              const MCSymbol &SymA,
+                                              const MCFragment &FB, bool InSet,
+                                              bool IsPCRel) const override;
+  void recordRelocation(MCAssembler &Asm, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
+  uint64_t writeObject(MCAssembler &Asm) override;
+};
+
+} // end anonymous namespace
 
 static bool isDwoSection(const MCSection &Sec) {
   return Sec.getName().ends_with(".dwo");
@@ -636,7 +659,7 @@ void WinCOFFWriter::writeSection(MCAssembler &Asm, const COFFSection &Sec) {
 
 // Create .file symbols.
 void WinCOFFWriter::createFileSymbols(MCAssembler &Asm) {
-  for (const std::pair<std::string, size_t> &It : OWriter.getFileNames()) {
+  for (const std::pair<std::string, size_t> &It : Asm.getFileNames()) {
     // round up to calculate the number of auxiliary symbols required
     const std::string &Name = It.first;
     unsigned SymbolSize = UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size;
@@ -1084,12 +1107,12 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
   }
 
   // Create the contents of the .llvm.call-graph-profile section.
-  if (Mode != DwoOnly && !OWriter.getCGProfile().empty()) {
+  if (Mode != DwoOnly && !Asm.CGProfile.empty()) {
     auto *Sec = Asm.getContext().getCOFFSection(
         ".llvm.call-graph-profile", COFF::IMAGE_SCN_LNK_REMOVE);
     auto *Frag = cast<MCDataFragment>(Sec->curFragList()->Head);
     raw_svector_ostream OS(Frag->getContents());
-    for (const auto &CGPE : OWriter.getCGProfile()) {
+    for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
       uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
       uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
       support::endian::write(OS, FromIndex, W.Endian);
@@ -1102,7 +1125,7 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
 
   // MS LINK expects to be able to use this timestamp to implement their
   // /INCREMENTAL feature.
-  if (OWriter.IncrementalLinkerCompatible) {
+  if (Asm.isIncrementalLinkerCompatible()) {
     Header.TimeDateStamp = getTime();
   } else {
     // Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU.
@@ -1151,7 +1174,6 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) {
 // MCObjectWriter interface implementations
 
 void WinCOFFObjectWriter::reset() {
-  IncrementalLinkerCompatible = false;
   ObjWriter->reset();
   if (DwoWriter)
     DwoWriter->reset();
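
Both the Mach-O and COFF writers now serialize Asm.CGProfile with the same
record shape. One entry in .llvm.call-graph-profile (or __cg_profile) looks
like the sketch below; the trailing 64-bit weight is an assumption here, since
the write of Count falls outside the hunks shown:

    #include <cstdint>

    // One call-graph-profile record, written in the target's byte
    // order: symbol-table indices of caller and callee, then weight.
    struct CGProfileRecord {
      uint32_t FromIndex; // caller's symbol-table index
      uint32_t ToIndex;   // callee's symbol-table index
      uint64_t Count;     // call weight (assumed to follow the indices)
    };
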
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index 124b31e870884..2735142c576b1 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -296,6 +296,7 @@ class XCOFFObjectWriter : public MCObjectWriter {
   uint64_t SymbolTableOffset = 0;
   uint16_t SectionCount = 0;
   uint32_t PaddingsBeforeDwarf = 0;
+  std::vector<std::pair<std::string, size_t>> FileNames;
   bool HasVisibility = false;
 
   support::endian::Writer W;
@@ -421,12 +422,16 @@ class XCOFFObjectWriter : public MCObjectWriter {
   void finalizeRelocationInfo(SectionEntry *Sec, uint64_t RelCount);
   void calcOffsetToRelocations(SectionEntry *Sec, uint64_t &RawPointer);
 
+  void addExceptionEntry(const MCSymbol *Symbol, const MCSymbol *Trap,
+                         unsigned LanguageCode, unsigned ReasonCode,
+                         unsigned FunctionSize, bool hasDebug) override;
   bool hasExceptionSection() {
     return !ExceptionSection.ExceptionTable.empty();
   }
   unsigned getExceptionSectionSize();
   unsigned getExceptionOffset(const MCSymbol *Symbol);
 
+  void addCInfoSymEntry(StringRef Name, StringRef Metadata) override;
   size_t auxiliaryHeaderSize() const {
     // 64-bit object files have no auxiliary header.
     return HasVisibility && !is64Bit() ? XCOFF::AuxFileHeaderSizeShort : 0;
@@ -439,11 +444,6 @@ class XCOFFObjectWriter : public MCObjectWriter {
   void writeWord(uint64_t Word) {
     is64Bit() ? W.write<uint64_t>(Word) : W.write<uint32_t>(Word);
   }
-
-  void addExceptionEntry(const MCSymbol *Symbol, const MCSymbol *Trap,
-                         unsigned LanguageCode, unsigned ReasonCode,
-                         unsigned FunctionSize, bool hasDebug);
-  void addCInfoSymEntry(StringRef Name, StringRef Metadata);
 };
 
 XCOFFObjectWriter::XCOFFObjectWriter(
@@ -596,9 +596,6 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) {
     const MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(&S);
     const MCSectionXCOFF *ContainingCsect = getContainingCsect(XSym);
 
-    if (ContainingCsect->isDwarfSect())
-      continue;
-
     if (XSym->getVisibilityType() != XCOFF::SYM_V_UNSPECIFIED)
       HasVisibility = true;
 
@@ -637,6 +634,7 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) {
   if (CISI && nameShouldBeInStringTable(CISI->Name))
     Strings.add(CISI->Name);
 
+  FileNames = Asm.getFileNames();
   // Emit ".file" as the source file name when there is no file name.
   if (FileNames.empty())
     FileNames.emplace_back(".file", 0);
@@ -649,7 +647,7 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm) {
   // the AUX_FILE auxiliary entry.
   if (nameShouldBeInStringTable(".file"))
     Strings.add(".file");
-  StringRef Vers = CompilerVersion;
+  StringRef Vers = Asm.getCompilerVersion();
   if (auxFileSymNameShouldBeInStringTable(Vers))
     Strings.add(Vers);
 
@@ -827,6 +825,8 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm) {
   // We always emit a timestamp of 0 for reproducibility, so ensure incremental
   // linking is not enabled, in case, like with Windows COFF, such a timestamp
   // is incompatible with incremental linking of XCOFF.
+  if (Asm.isIncrementalLinkerCompatible())
+    report_fatal_error("Incremental linking not supported for XCOFF.");
 
   finalizeSectionInfo();
   uint64_t StartOffset = W.OS.tell();
@@ -1159,7 +1159,7 @@ void XCOFFObjectWriter::writeRelocations() {
 
 void XCOFFObjectWriter::writeSymbolTable(MCAssembler &Asm) {
   // Write C_FILE symbols.
-  StringRef Vers = CompilerVersion;
+  StringRef Vers = Asm.getCompilerVersion();
 
   for (const std::pair<std::string, size_t> &F : FileNames) {
     // The n_name of a C_FILE symbol is the source file's name when no auxiliary
@@ -1415,7 +1415,7 @@ void XCOFFObjectWriter::assignAddressesAndIndices(MCAssembler &Asm) {
   // The symbol table starts with all the C_FILE symbols. Each C_FILE symbol
   // requires 1 or 2 auxiliary entries.
   uint32_t SymbolTableIndex =
-      (2 + (CompilerVersion.empty() ? 0 : 1)) * FileNames.size();
+      (2 + (Asm.getCompilerVersion().empty() ? 0 : 1)) * FileNames.size();
 
   if (CInfoSymSection.Entry)
     SymbolTableIndex++;
@@ -1738,18 +1738,3 @@ llvm::createXCOFFObjectWriter(std::unique_ptr<MCXCOFFObjectTargetWriter> MOTW,
                               raw_pwrite_stream &OS) {
   return std::make_unique<XCOFFObjectWriter>(std::move(MOTW), OS);
 }
-
-// TODO: Export XCOFFObjectWriter to llvm/MC/MCXCOFFObjectWriter.h and remove
-// the forwarders.
-void XCOFF::addExceptionEntry(MCObjectWriter &Writer, const MCSymbol *Symbol,
-                              const MCSymbol *Trap, unsigned LanguageCode,
-                              unsigned ReasonCode, unsigned FunctionSize,
-                              bool hasDebug) {
-  static_cast<XCOFFObjectWriter &>(Writer).addExceptionEntry(
-      Symbol, Trap, LanguageCode, ReasonCode, FunctionSize, hasDebug);
-}
-
-void XCOFF::addCInfoSymEntry(MCObjectWriter &Writer, StringRef Name,
-                             StringRef Metadata) {
-  static_cast<XCOFFObjectWriter &>(Writer).addCInfoSymEntry(Name, Metadata);
-}
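
The SymbolTableIndex arithmetic above follows directly from the C_FILE layout:
each file symbol occupies one main entry plus one auxiliary entry for the
name, and one extra auxiliary entry when a compiler version string is
recorded. A worked sketch:

    #include <cstddef>

    // Index of the first symbol after the C_FILE block: each file
    // contributes 2 entries, or 3 with a compiler version string.
    size_t firstNonFileSymbolIndex(size_t NumFiles, bool HasCompilerVersion) {
      return (2 + (HasCompilerVersion ? 1 : 0)) * NumFiles;
    }

    // Two source files plus a version string: (2 + 1) * 2 == 6 entries
    // precede the first non-C_FILE symbol.
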
diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index e67b02405a3a1..2458a53cb6d54 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -729,10 +729,7 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
         else if (Machine == IMAGE_FILE_MACHINE_I386 &&
                  applyNameType(IMPORT_NAME_NOPREFIX, Name) == E.ImportName)
           NameType = IMPORT_NAME_NOPREFIX;
-        else if (isArm64EC(M)) {
-          NameType = IMPORT_NAME_EXPORTAS;
-          ExportName = E.ImportName;
-        } else if (Name == E.ImportName)
+        else if (Name == E.ImportName)
           NameType = IMPORT_NAME;
         else {
           Deferred D;
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 5dbb1e2f49871..a9d3f8ec3a4ec 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -845,8 +845,6 @@ Expected<SimplifyCFGOptions> parseSimplifyCFGOptions(StringRef Params) {
       Result.hoistCommonInsts(Enable);
     } else if (ParamName == "sink-common-insts") {
       Result.sinkCommonInsts(Enable);
-    } else if (ParamName == "speculate-unpredictables") {
-      Result.speculateUnpredictables(Enable);
     } else if (Enable && ParamName.consume_front("bonus-inst-threshold=")) {
       APInt BonusInstThreshold;
       if (ParamName.getAsInteger(0, BonusInstThreshold))
@@ -1182,7 +1180,7 @@ parseRegAllocFastPassOptions(PassBuilder &PB, StringRef Params) {
     std::tie(ParamName, Params) = Params.split(';');
 
     if (ParamName.consume_front("filter=")) {
-      std::optional<RegAllocFilterFunc> Filter =
+      std::optional<RegClassFilterFunc> Filter =
           PB.parseRegAllocFilter(ParamName);
       if (!Filter) {
         return make_error<StringError>(
@@ -2192,7 +2190,7 @@ Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
   return Error::success();
 }
 
-std::optional<RegAllocFilterFunc>
+std::optional<RegClassFilterFunc>
 PassBuilder::parseRegAllocFilter(StringRef FilterName) {
   if (FilterName == "all")
     return nullptr;
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6f36bdad780ae..4fd5ee1946bb7 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -296,9 +296,6 @@ static cl::opt<AttributorRunOption> AttributorRun(
                clEnumValN(AttributorRunOption::NONE, "none",
                           "disable attributor runs")));
 
-static cl::opt<bool> EnableSampledInstr(
-    "enable-sampled-instrumentation", cl::init(false), cl::Hidden,
-    cl::desc("Enable profile instrumentation sampling (default = off)"));
 static cl::opt<bool> UseLoopVersioningLICM(
     "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
     cl::desc("Enable the experimental Loop Versioning LICM pass"));
@@ -850,12 +847,6 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
   // Do counter promotion at Level greater than O0.
   Options.DoCounterPromotion = true;
   Options.UseBFIInPromotion = IsCS;
-  if (EnableSampledInstr) {
-    Options.Sampling = true;
-    // With sampling, there is little beneifit to enable counter promotion.
-    // But note that sampling does work with counter promotion.
-    Options.DoCounterPromotion = false;
-  }
   Options.Atomic = AtomicCounterUpdate;
   MPM.addPass(InstrProfilingLoweringPass(Options, IsCS));
 }
@@ -1194,8 +1185,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
     MPM.addPass(PGOIndirectCallPromotion(false, false));
 
   if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr)
-    MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile,
-                                               EnableSampledInstr));
+    MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile));
 
   if (IsMemprofUse)
     MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS));
@@ -1515,9 +1505,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
 
   // LoopSink (and other loop passes since the last simplifyCFG) might have
   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
-  OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
-                                         .convertSwitchRangeToICmp(true)
-                                         .speculateUnpredictables(true)));
+  OptimizePM.addPass(
+      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
 
   // Add the core optimizing pipeline.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM),
@@ -2035,10 +2024,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   LateFPM.addPass(DivRemPairsPass());
 
   // Delete basic blocks, which optimization passes may have killed.
-  LateFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
-                                      .convertSwitchRangeToICmp(true)
-                                      .hoistCommonInsts(true)
-                                      .speculateUnpredictables(true)));
+  LateFPM.addPass(SimplifyCFGPass(
+      SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts(
+          true)));
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));
 
   // Drop bodies of available eternally objects to improve GlobalDCE.
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 51c9af8a6e1fe..209b677bafbb5 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -455,51 +455,6 @@ void Instruction::dump() const {
 }
 #endif // NDEBUG
 
-Value *SelectInst::createCommon(Value *Cond, Value *True, Value *False,
-                                const Twine &Name, IRBuilder<> &Builder,
-                                Context &Ctx) {
-  llvm::Value *NewV =
-      Builder.CreateSelect(Cond->Val, True->Val, False->Val, Name);
-  if (auto *NewSI = dyn_cast<llvm::SelectInst>(NewV))
-    return Ctx.createSelectInst(NewSI);
-  assert(isa<llvm::Constant>(NewV) && "Expected constant");
-  return Ctx.getOrCreateConstant(cast<llvm::Constant>(NewV));
-}
-
-Value *SelectInst::create(Value *Cond, Value *True, Value *False,
-                          Instruction *InsertBefore, Context &Ctx,
-                          const Twine &Name) {
-  llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
-  auto &Builder = Ctx.getLLVMIRBuilder();
-  Builder.SetInsertPoint(BeforeIR);
-  return createCommon(Cond, True, False, Name, Builder, Ctx);
-}
-
-Value *SelectInst::create(Value *Cond, Value *True, Value *False,
-                          BasicBlock *InsertAtEnd, Context &Ctx,
-                          const Twine &Name) {
-  auto *IRInsertAtEnd = cast<llvm::BasicBlock>(InsertAtEnd->Val);
-  auto &Builder = Ctx.getLLVMIRBuilder();
-  Builder.SetInsertPoint(IRInsertAtEnd);
-  return createCommon(Cond, True, False, Name, Builder, Ctx);
-}
-
-bool SelectInst::classof(const Value *From) {
-  return From->getSubclassID() == ClassID::Select;
-}
-
-#ifndef NDEBUG
-void SelectInst::dump(raw_ostream &OS) const {
-  dumpCommonPrefix(OS);
-  dumpCommonSuffix(OS);
-}
-
-void SelectInst::dump() const {
-  dump(dbgs());
-  dbgs() << "\n";
-}
-#endif // NDEBUG
-
 LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
                            Instruction *InsertBefore, Context &Ctx,
                            const Twine &Name) {
@@ -585,48 +540,6 @@ void StoreInst::dump() const {
   dump(dbgs());
   dbgs() << "\n";
 }
-#endif // NDEBUG
-
-ReturnInst *ReturnInst::createCommon(Value *RetVal, IRBuilder<> &Builder,
-                                     Context &Ctx) {
-  llvm::ReturnInst *NewRI;
-  if (RetVal != nullptr)
-    NewRI = Builder.CreateRet(RetVal->Val);
-  else
-    NewRI = Builder.CreateRetVoid();
-  return Ctx.createReturnInst(NewRI);
-}
-
-ReturnInst *ReturnInst::create(Value *RetVal, Instruction *InsertBefore,
-                               Context &Ctx) {
-  llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
-  auto &Builder = Ctx.getLLVMIRBuilder();
-  Builder.SetInsertPoint(BeforeIR);
-  return createCommon(RetVal, Builder, Ctx);
-}
-
-ReturnInst *ReturnInst::create(Value *RetVal, BasicBlock *InsertAtEnd,
-                               Context &Ctx) {
-  auto &Builder = Ctx.getLLVMIRBuilder();
-  Builder.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
-  return createCommon(RetVal, Builder, Ctx);
-}
-
-Value *ReturnInst::getReturnValue() const {
-  auto *LLVMRetVal = cast<llvm::ReturnInst>(Val)->getReturnValue();
-  return LLVMRetVal != nullptr ? Ctx.getValue(LLVMRetVal) : nullptr;
-}
-
-#ifndef NDEBUG
-void ReturnInst::dump(raw_ostream &OS) const {
-  dumpCommonPrefix(OS);
-  dumpCommonSuffix(OS);
-}
-
-void ReturnInst::dump() const {
-  dump(dbgs());
-  dbgs() << "\n";
-}
 
 void OpaqueInst::dump(raw_ostream &OS) const {
   dumpCommonPrefix(OS);
@@ -637,15 +550,7 @@ void OpaqueInst::dump() const {
   dump(dbgs());
   dbgs() << "\n";
 }
-#endif // NDEBUG
 
-Constant *Constant::createInt(Type *Ty, uint64_t V, Context &Ctx,
-                              bool IsSigned) {
-  llvm::Constant *LLVMC = llvm::ConstantInt::get(Ty, V, IsSigned);
-  return Ctx.getOrCreateConstant(LLVMC);
-}
-
-#ifndef NDEBUG
 void Constant::dump(raw_ostream &OS) const {
   dumpCommonPrefix(OS);
   dumpCommonSuffix(OS);
@@ -721,7 +626,7 @@ Value *Context::registerValue(std::unique_ptr<Value> &&VPtr) {
          "Can't register a user!");
   Value *V = VPtr.get();
   [[maybe_unused]] auto Pair =
-      LLVMValueToValueMap.insert({VPtr->Val, std::move(VPtr)});
+         LLVMValueToValueMap.insert({VPtr->Val, std::move(VPtr)});
   assert(Pair.second && "Already exists!");
   return V;
 }
@@ -753,11 +658,6 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
   assert(isa<llvm::Instruction>(LLVMV) && "Expected Instruction");
 
   switch (cast<llvm::Instruction>(LLVMV)->getOpcode()) {
-  case llvm::Instruction::Select: {
-    auto *LLVMSel = cast<llvm::SelectInst>(LLVMV);
-    It->second = std::unique_ptr<SelectInst>(new SelectInst(LLVMSel, *this));
-    return It->second.get();
-  }
   case llvm::Instruction::Load: {
     auto *LLVMLd = cast<llvm::LoadInst>(LLVMV);
     It->second = std::unique_ptr<LoadInst>(new LoadInst(LLVMLd, *this));
@@ -768,11 +668,6 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
     It->second = std::unique_ptr<StoreInst>(new StoreInst(LLVMSt, *this));
     return It->second.get();
   }
-  case llvm::Instruction::Ret: {
-    auto *LLVMRet = cast<llvm::ReturnInst>(LLVMV);
-    It->second = std::unique_ptr<ReturnInst>(new ReturnInst(LLVMRet, *this));
-    return It->second.get();
-  }
   default:
     break;
   }
@@ -791,11 +686,6 @@ BasicBlock *Context::createBasicBlock(llvm::BasicBlock *LLVMBB) {
   return BB;
 }
 
-SelectInst *Context::createSelectInst(llvm::SelectInst *SI) {
-  auto NewPtr = std::unique_ptr<SelectInst>(new SelectInst(SI, *this));
-  return cast<SelectInst>(registerValue(std::move(NewPtr)));
-}
-
 LoadInst *Context::createLoadInst(llvm::LoadInst *LI) {
   auto NewPtr = std::unique_ptr<LoadInst>(new LoadInst(LI, *this));
   return cast<LoadInst>(registerValue(std::move(NewPtr)));
@@ -806,11 +696,6 @@ StoreInst *Context::createStoreInst(llvm::StoreInst *SI) {
   return cast<StoreInst>(registerValue(std::move(NewPtr)));
 }
 
-ReturnInst *Context::createReturnInst(llvm::ReturnInst *I) {
-  auto NewPtr = std::unique_ptr<ReturnInst>(new ReturnInst(I, *this));
-  return cast<ReturnInst>(registerValue(std::move(NewPtr)));
-}
-
 Value *Context::getValue(llvm::Value *V) const {
   auto It = LLVMValueToValueMap.find(V);
   if (It != LLVMValueToValueMap.end())
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index c2014028ddadc..9612db7d30f98 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -73,20 +73,12 @@ struct llvm::TimeTraceProfilerEntry {
   const TimePointType Start;
   TimePointType End;
   const std::string Name;
-  TimeTraceMetadata Metadata;
-
+  const std::string Detail;
   const bool AsyncEvent = false;
   TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N,
                          std::string &&Dt, bool Ae)
-      : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), Metadata(),
-        AsyncEvent(Ae) {
-    Metadata.Detail = std::move(Dt);
-  }
-
-  TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N,
-                         TimeTraceMetadata &&Mt, bool Ae)
       : Start(std::move(S)), End(std::move(E)), Name(std::move(N)),
-        Metadata(std::move(Mt)), AsyncEvent(Ae) {}
+        Detail(std::move(Dt)), AsyncEvent(Ae) {}
 
   // Calculate timings for FlameGraph. Cast time points to microsecond precision
   // rather than casting duration. This avoids truncation issues causing inner
@@ -105,12 +97,10 @@ struct llvm::TimeTraceProfilerEntry {
 };
 
 struct llvm::TimeTraceProfiler {
-  TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "",
-                    bool TimeTraceVerbose = false)
+  TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "")
       : BeginningOfTime(system_clock::now()), StartTime(ClockType::now()),
         ProcName(ProcName), Pid(sys::Process::getProcessId()),
-        Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity),
-        TimeTraceVerbose(TimeTraceVerbose) {
+        Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity) {
     llvm::get_thread_name(ThreadName);
   }
 
@@ -123,15 +113,6 @@ struct llvm::TimeTraceProfiler {
     return Stack.back().get();
   }
 
-  TimeTraceProfilerEntry *
-  begin(std::string Name, llvm::function_ref<TimeTraceMetadata()> Metadata,
-        bool AsyncEvent = false) {
-    Stack.emplace_back(std::make_unique<TimeTraceProfilerEntry>(
-        ClockType::now(), TimePointType(), std::move(Name), Metadata(),
-        AsyncEvent));
-    return Stack.back().get();
-  }
-
   void end() {
     assert(!Stack.empty() && "Must call begin() first");
     end(*Stack.back());
@@ -203,15 +184,8 @@ struct llvm::TimeTraceProfiler {
           J.attribute("dur", DurUs);
         }
         J.attribute("name", E.Name);
-        if (!E.Metadata.isEmpty()) {
-          J.attributeObject("args", [&] {
-            if (!E.Metadata.Detail.empty())
-              J.attribute("detail", E.Metadata.Detail);
-            if (!E.Metadata.File.empty())
-              J.attribute("file", E.Metadata.File);
-            if (E.Metadata.Line > 0)
-              J.attribute("line", E.Metadata.Line);
-          });
+        if (!E.Detail.empty()) {
+          J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
         }
       });
 
@@ -333,25 +307,14 @@ struct llvm::TimeTraceProfiler {
 
   // Minimum time granularity (in microseconds)
   const unsigned TimeTraceGranularity;
-
-  // Make time trace capture verbose event details (e.g. source filenames). This
-  // can increase the size of the output by 2-3 times.
-  const bool TimeTraceVerbose;
 };
 
-bool llvm::isTimeTraceVerbose() {
-  return getTimeTraceProfilerInstance() &&
-         getTimeTraceProfilerInstance()->TimeTraceVerbose;
-}
-
 void llvm::timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
-                                       StringRef ProcName,
-                                       bool TimeTraceVerbose) {
+                                       StringRef ProcName) {
   assert(TimeTraceProfilerInstance == nullptr &&
          "Profiler should not be initialized");
   TimeTraceProfilerInstance = new TimeTraceProfiler(
-      TimeTraceGranularity, llvm::sys::path::filename(ProcName),
-      TimeTraceVerbose);
+      TimeTraceGranularity, llvm::sys::path::filename(ProcName));
 }
 
 // Removes all TimeTraceProfilerInstances.
@@ -418,14 +381,6 @@ llvm::timeTraceProfilerBegin(StringRef Name,
   return nullptr;
 }
 
-TimeTraceProfilerEntry *
-llvm::timeTraceProfilerBegin(StringRef Name,
-                             llvm::function_ref<TimeTraceMetadata()> Metadata) {
-  if (TimeTraceProfilerInstance != nullptr)
-    return TimeTraceProfilerInstance->begin(std::string(Name), Metadata, false);
-  return nullptr;
-}
-
 TimeTraceProfilerEntry *llvm::timeTraceAsyncProfilerBegin(StringRef Name,
                                                           StringRef Detail) {
   if (TimeTraceProfilerInstance != nullptr)
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp
index 04b3233084a41..4cd3d58b80198 100644
--- a/llvm/lib/Support/raw_socket_stream.cpp
+++ b/llvm/lib/Support/raw_socket_stream.cpp
@@ -18,7 +18,6 @@
 
 #include <atomic>
 #include <fcntl.h>
-#include <functional>
 #include <thread>
 
 #ifndef _WIN32
@@ -178,89 +177,70 @@ Expected<ListeningSocket> ListeningSocket::createUnix(StringRef SocketPath,
 #endif // _WIN32
 }
 
-// If a file descriptor being monitored by ::poll is closed by another thread,
-// the result is unspecified. In the case ::poll does not unblock and return,
-// when ActiveFD is closed, you can provide another file descriptor via CancelFD
-// that when written to will cause poll to return. Typically CancelFD is the
-// read end of a unidirectional pipe.
-//
-// Timeout should be -1 to block indefinitly
-//
-// getActiveFD is a callback to handle ActiveFD's of std::atomic<int> and int
-static std::error_code
-manageTimeout(const std::chrono::milliseconds &Timeout,
-              const std::function<int()> &getActiveFD,
-              const std::optional<int> &CancelFD = std::nullopt) {
-  struct pollfd FD[2];
-  FD[0].events = POLLIN;
+Expected<std::unique_ptr<raw_socket_stream>>
+ListeningSocket::accept(std::chrono::milliseconds Timeout) {
+
+  struct pollfd FDs[2];
+  FDs[0].events = POLLIN;
 #ifdef _WIN32
-  SOCKET WinServerSock = _get_osfhandle(getActiveFD());
-  FD[0].fd = WinServerSock;
+  SOCKET WinServerSock = _get_osfhandle(FD);
+  FDs[0].fd = WinServerSock;
 #else
-  FD[0].fd = getActiveFD();
+  FDs[0].fd = FD;
 #endif
-  uint8_t FDCount = 1;
-  if (CancelFD.has_value()) {
-    FD[1].events = POLLIN;
-    FD[1].fd = CancelFD.value();
-    FDCount++;
-  }
+  FDs[1].events = POLLIN;
+  FDs[1].fd = PipeFD[0];
 
-  // Keep track of how much time has passed in case ::poll or WSAPoll are
-  // interupted by a signal and need to be recalled
-  auto Start = std::chrono::steady_clock::now();
-  auto RemainingTimeout = Timeout;
-  int PollStatus = 0;
-  do {
-    // If Timeout is -1 then poll should block and RemainingTimeout does not
-    // need to be recalculated
-    if (PollStatus != 0 && Timeout != std::chrono::milliseconds(-1)) {
-      auto TotalElapsedTime =
-          std::chrono::duration_cast<std::chrono::milliseconds>(
-              std::chrono::steady_clock::now() - Start);
-
-      if (TotalElapsedTime >= Timeout)
-        return std::make_error_code(std::errc::operation_would_block);
-
-      RemainingTimeout = Timeout - TotalElapsedTime;
-    }
+  // Keep track of how much time has passed in case poll is interrupted by a
+  // signal and needs to be called again
+  int RemainingTime = Timeout.count();
+  std::chrono::milliseconds ElapsedTime = std::chrono::milliseconds(0);
+  int PollStatus = -1;
+
+  while (PollStatus == -1 && (Timeout.count() == -1 || ElapsedTime < Timeout)) {
+    if (Timeout.count() != -1)
+      RemainingTime -= ElapsedTime.count();
+
+    auto Start = std::chrono::steady_clock::now();
 #ifdef _WIN32
-    PollStatus = WSAPoll(FD, FDCount, RemainingTimeout.count());
-  } while (PollStatus == SOCKET_ERROR &&
-           getLastSocketErrorCode() == std::errc::interrupted);
+    PollStatus = WSAPoll(FDs, 2, RemainingTime);
 #else
-    PollStatus = ::poll(FD, FDCount, RemainingTimeout.count());
-  } while (PollStatus == -1 &&
-           getLastSocketErrorCode() == std::errc::interrupted);
+    PollStatus = ::poll(FDs, 2, RemainingTime);
 #endif
+    // If FD equals -1 then ListeningSocket::shutdown has been called and it is
+    // appropriate to return operation_canceled
+    if (FD.load() == -1)
+      return llvm::make_error<StringError>(
+          std::make_error_code(std::errc::operation_canceled),
+          "Accept canceled");
 
-  // If ActiveFD equals -1 or CancelFD has data to be read then the operation
-  // has been canceled by another thread
-  if (getActiveFD() == -1 || (CancelFD.has_value() && FD[1].revents & POLLIN))
-    return std::make_error_code(std::errc::operation_canceled);
 #if _WIN32
-  if (PollStatus == SOCKET_ERROR)
+    if (PollStatus == SOCKET_ERROR) {
 #else
-  if (PollStatus == -1)
+    if (PollStatus == -1) {
 #endif
-    return getLastSocketErrorCode();
-  if (PollStatus == 0)
-    return std::make_error_code(std::errc::timed_out);
-  if (FD[0].revents & POLLNVAL)
-    return std::make_error_code(std::errc::bad_file_descriptor);
-  return std::error_code();
-}
+      std::error_code PollErrCode = getLastSocketErrorCode();
+      // Ignore EINTR (a signal occurred before any requested event) and retry
+      if (PollErrCode != std::errc::interrupted)
+        return llvm::make_error<StringError>(PollErrCode, "FD poll failed");
+    }
+    if (PollStatus == 0)
+      return llvm::make_error<StringError>(
+          std::make_error_code(std::errc::timed_out),
+          "No client requests within timeout window");
 
-Expected<std::unique_ptr<raw_socket_stream>>
-ListeningSocket::accept(const std::chrono::milliseconds &Timeout) {
-  auto getActiveFD = [this]() -> int { return FD; };
-  std::error_code TimeoutErr = manageTimeout(Timeout, getActiveFD, PipeFD[0]);
-  if (TimeoutErr)
-    return llvm::make_error<StringError>(TimeoutErr, "Timeout error");
+    if (FDs[0].revents & POLLNVAL)
+      return llvm::make_error<StringError>(
+          std::make_error_code(std::errc::bad_file_descriptor));
+
+    auto Stop = std::chrono::steady_clock::now();
+    ElapsedTime +=
+        std::chrono::duration_cast<std::chrono::milliseconds>(Stop - Start);
+  }
 
   int AcceptFD;
 #ifdef _WIN32
-  SOCKET WinAcceptSock = ::accept(_get_osfhandle(FD), NULL, NULL);
+  SOCKET WinAcceptSock = ::accept(WinServerSock, NULL, NULL);
   AcceptFD = _open_osfhandle(WinAcceptSock, 0);
 #else
   AcceptFD = ::accept(FD, NULL, NULL);
@@ -315,8 +295,6 @@ ListeningSocket::~ListeningSocket() {
 raw_socket_stream::raw_socket_stream(int SocketFD)
     : raw_fd_stream(SocketFD, true) {}
 
-raw_socket_stream::~raw_socket_stream() {}
-
 Expected<std::unique_ptr<raw_socket_stream>>
 raw_socket_stream::createConnectedUnix(StringRef SocketPath) {
 #ifdef _WIN32
@@ -328,14 +306,4 @@ raw_socket_stream::createConnectedUnix(StringRef SocketPath) {
   return std::make_unique<raw_socket_stream>(*FD);
 }
 
-ssize_t raw_socket_stream::read(char *Ptr, size_t Size,
-                                const std::chrono::milliseconds &Timeout) {
-  auto getActiveFD = [this]() -> int { return this->get_fd(); };
-  std::error_code Err = manageTimeout(Timeout, getActiveFD);
-  // Mimic raw_fd_stream::read error handling behavior
-  if (Err) {
-    raw_fd_stream::error_detected(Err);
-    return -1;
-  }
-  return raw_fd_stream::read(Ptr, Size);
-}
+raw_socket_stream::~raw_socket_stream() {}
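
The accept implementation above inlines a poll-with-deadline loop. Its core
pattern, sketched on its own under POSIX semantics (the function name is
illustrative; a Timeout of -1 ms means block indefinitely, matching the
convention in the code above):

    #include <cerrno>
    #include <chrono>
    #include <poll.h>

    // Retry ::poll when a signal interrupts it (EINTR), charging the
    // time already spent against the caller's timeout budget.
    int pollWithTimeout(pollfd *FDs, nfds_t Count,
                        std::chrono::milliseconds Timeout) {
      using namespace std::chrono;
      milliseconds Elapsed{0};
      int Status = -1;
      while (Status == -1 && (Timeout.count() == -1 || Elapsed < Timeout)) {
        int Remaining =
            Timeout.count() == -1 ? -1 : int((Timeout - Elapsed).count());
        auto Start = steady_clock::now();
        Status = ::poll(FDs, Count, Remaining);
        if (Status == -1 && errno != EINTR)
          break; // real failure; let the caller inspect errno
        Elapsed += duration_cast<milliseconds>(steady_clock::now() - Start);
      }
      return Status; // >0 ready, 0 timed out, -1 error
    }
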
diff --git a/llvm/lib/Support/xxhash.cpp b/llvm/lib/Support/xxhash.cpp
index cdb76d57e2c1d..607789b391381 100644
--- a/llvm/lib/Support/xxhash.cpp
+++ b/llvm/lib/Support/xxhash.cpp
@@ -47,19 +47,6 @@
 
 #include <stdlib.h>
 
-#if !defined(LLVM_XXH_USE_NEON)
-#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) &&      \
-    !defined(__ARM_BIG_ENDIAN)
-#define LLVM_XXH_USE_NEON 1
-#else
-#define LLVM_XXH_USE_NEON 0
-#endif
-#endif
-
-#if LLVM_XXH_USE_NEON
-#include <arm_neon.h>
-#endif
-
 using namespace llvm;
 using namespace support;
 
@@ -336,144 +323,6 @@ static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
   return XXH3_avalanche(acc);
 }
 
-#if LLVM_XXH_USE_NEON
-
-#define XXH3_accumulate_512 XXH3_accumulate_512_neon
-#define XXH3_scrambleAcc XXH3_scrambleAcc_neon
-
-// NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
-// (July 2024), with the following removed:
-// - workaround for suboptimal codegen on older GCC
-// - compiler barriers against instruction reordering
-// - WebAssembly SIMD support
-// - configurable split between NEON and scalar lanes (benchmarking shows no
-//   penalty when fully doing SIMD on the Apple M1)
-
-#if defined(__GNUC__) || defined(__clang__)
-#define XXH_ALIASING __attribute__((__may_alias__))
-#else
-#define XXH_ALIASING /* nothing */
-#endif
-
-typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
-
-LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
-  return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
-}
-
-LLVM_ATTRIBUTE_ALWAYS_INLINE
-static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
-                                     const uint8_t *secret) {
-  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
-
-#ifdef __clang__
-#pragma clang loop unroll(full)
-#endif
-  for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
-    /* data_vec = input[i]; */
-    uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
-    uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));
-
-    /* key_vec  = secret[i];  */
-    uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
-    uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));
-
-    /* data_swap = swap(data_vec) */
-    uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
-    uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
-
-    /* data_key = data_vec ^ key_vec; */
-    uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
-    uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
-
-    /*
-     * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
-     * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
-     * get one vector with the low 32 bits of each lane, and one vector
-     * with the high 32 bits of each lane.
-     *
-     * The intrinsic returns a double vector because the original ARMv7-a
-     * instruction modified both arguments in place. AArch64 and SIMD128 emit
-     * two instructions from this intrinsic.
-     *
-     *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
-     *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
-     */
-    uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
-                                      vreinterpretq_u32_u64(data_key_2));
-
-    /* data_key_lo = data_key & 0xFFFFFFFF */
-    uint32x4_t data_key_lo = unzipped.val[0];
-    /* data_key_hi = data_key >> 32 */
-    uint32x4_t data_key_hi = unzipped.val[1];
-
-    /*
-     * Then, we can split the vectors horizontally and multiply which, as for
-     * most widening intrinsics, have a variant that works on both high half
-     * vectors for free on AArch64. A similar instruction is available on
-     * SIMD128.
-     *
-     * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
-     */
-    uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
-                                 vget_low_u32(data_key_hi));
-    uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
-                                 vget_high_u32(data_key_hi));
-
-    /* xacc[i] = acc_vec + sum; */
-    xacc[i] = vaddq_u64(xacc[i], sum_1);
-    xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
-  }
-}
-
-LLVM_ATTRIBUTE_ALWAYS_INLINE
-static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
-  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
-
-  /* { prime32_1, prime32_1 } */
-  uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
-  /* { 0, prime32_1, 0, prime32_1 } */
-  uint32x4_t const kPrimeHi =
-      vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));
-
-  for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
-    /* xacc[i] ^= (xacc[i] >> 47); */
-    uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
-    uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
-    uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
-
-    /* xacc[i] ^= secret[i]; */
-    uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
-    uint64x2_t data_key = veorq_u64(data_vec, key_vec);
-
-    /*
-     * xacc[i] *= XXH_PRIME32_1
-     *
-     * Expanded version with portable NEON intrinsics
-     *
-     *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
-     *
-     * prod_hi = hi(data_key) * lo(prime) << 32
-     *
-     * Since we only need 32 bits of this multiply a trick can be used,
-     * reinterpreting the vector as a uint32x4_t and multiplying by
-     * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
-     * shift.
-     */
-    uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);
-
-    /* Extract low bits for vmlal_u32  */
-    uint32x2_t data_key_lo = vmovn_u64(data_key);
-
-    /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
-    xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
-  }
-}
-#else
-
-#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
-#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
-
 LLVM_ATTRIBUTE_ALWAYS_INLINE
 static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
                                        const uint8_t *secret) {
@@ -486,23 +335,20 @@ static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
 }
 
 LLVM_ATTRIBUTE_ALWAYS_INLINE
-static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
+static void XXH3_accumulate_scalar(uint64_t *acc, const uint8_t *input,
+                                   const uint8_t *secret, size_t nbStripes) {
+  for (size_t n = 0; n < nbStripes; ++n)
+    XXH3_accumulate_512_scalar(acc, input + n * XXH_STRIPE_LEN,
+                               secret + n * XXH_SECRET_CONSUME_RATE);
+}
+
+static void XXH3_scrambleAcc(uint64_t *acc, const uint8_t *secret) {
   for (size_t i = 0; i < XXH_ACC_NB; ++i) {
     acc[i] ^= acc[i] >> 47;
     acc[i] ^= endian::read64le(secret + 8 * i);
     acc[i] *= PRIME32_1;
   }
 }
-#endif
-
-LLVM_ATTRIBUTE_ALWAYS_INLINE
-static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
-                            const uint8_t *secret, size_t nbStripes) {
-  for (size_t n = 0; n < nbStripes; ++n) {
-    XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
-                        secret + n * XXH_SECRET_CONSUME_RATE);
-  }
-}
 
 static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
   return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
@@ -529,20 +375,21 @@ static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
       PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
   };
   for (size_t n = 0; n < nb_blocks; ++n) {
-    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
+    XXH3_accumulate_scalar(acc, input + n * block_len, secret,
+                           nbStripesPerBlock);
     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
   }
 
   /* last partial block */
   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
-  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
+  XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
 
   /* last stripe */
   constexpr size_t XXH_SECRET_LASTACC_START = 7;
-  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
-                      secret + secretSize - XXH_STRIPE_LEN -
-                          XXH_SECRET_LASTACC_START);
+  XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
+                             secret + secretSize - XXH_STRIPE_LEN -
+                                 XXH_SECRET_LASTACC_START);
 
   /* converge into final hash */
   constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
@@ -993,20 +840,21 @@ XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
   };
 
   for (size_t n = 0; n < nb_blocks; ++n) {
-    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
+    XXH3_accumulate_scalar(acc, input + n * block_len, secret,
+                           nbStripesPerBlock);
     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
   }
 
   /* last partial block */
   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
-  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
+  XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
 
   /* last stripe */
   constexpr size_t XXH_SECRET_LASTACC_START = 7;
-  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
-                      secret + secretSize - XXH_STRIPE_LEN -
-                          XXH_SECRET_LASTACC_START);
+  XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
+                             secret + secretSize - XXH_STRIPE_LEN -
+                                 XXH_SECRET_LASTACC_START);
 
   /* converge into final hash */
   static_assert(sizeof(acc) == 64);
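
Both long-input paths above share the same driver structure. A compact sketch
using the helpers defined in this file; the constants (XXH_STRIPE_LEN = 64,
XXH_SECRET_CONSUME_RATE = 8, XXH_SECRET_LASTACC_START = 7) are carried over
from the surrounding code:

    // Accumulate whole blocks, scrambling the accumulators after each;
    // then the last partial block; then one final stripe anchored at
    // the very end of the input, with a small secret offset.
    static void hashLongDriver(uint64_t *Acc, const uint8_t *In, size_t Len,
                               const uint8_t *Secret, size_t SecretSize) {
      const size_t StripesPerBlock = (SecretSize - 64) / 8;
      const size_t BlockLen = 64 * StripesPerBlock;
      const size_t NumBlocks = (Len - 1) / BlockLen;
      for (size_t N = 0; N < NumBlocks; ++N) {
        XXH3_accumulate_scalar(Acc, In + N * BlockLen, Secret,
                               StripesPerBlock);
        XXH3_scrambleAcc(Acc, Secret + SecretSize - 64);
      }
      const size_t LastStripes = (Len - 1 - NumBlocks * BlockLen) / 64;
      XXH3_accumulate_scalar(Acc, In + NumBlocks * BlockLen, Secret,
                             LastStripes);
      XXH3_accumulate_512_scalar(Acc, In + Len - 64,
                                 Secret + SecretSize - 64 - 7);
    }
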
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 3c9b07ad45bf2..1e60ce9c40df8 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -68,15 +68,6 @@
 
 using namespace llvm;
 
-enum PtrauthCheckMode { Default, Unchecked, Poison, Trap };
-static cl::opt<PtrauthCheckMode> PtrauthAuthChecks(
-    "aarch64-ptrauth-auth-checks", cl::Hidden,
-    cl::values(clEnumValN(Unchecked, "none", "don't test for failure"),
-               clEnumValN(Poison, "poison", "poison on failure"),
-               clEnumValN(Trap, "trap", "trap on failure")),
-    cl::desc("Check pointer authentication auth/resign failures"),
-    cl::init(Default));
-
 #define DEBUG_TYPE "asm-printer"
 
 namespace {
@@ -102,8 +93,6 @@ class AArch64AsmPrinter : public AsmPrinter {
 
   const MCExpr *lowerConstantPtrAuth(const ConstantPtrAuth &CPA) override;
 
-  const MCExpr *lowerBlockAddressConstant(const BlockAddress &BA) override;
-
   void emitStartOfAsmFile(Module &M) override;
   void emitJumpTableInfo() override;
   std::tuple<const MCSymbol *, uint64_t, const MCSymbol *,
@@ -115,8 +104,6 @@ class AArch64AsmPrinter : public AsmPrinter {
 
   void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI);
 
-  void LowerHardenedBRJumpTable(const MachineInstr &MI);
-
   void LowerMOPS(MCStreamer &OutStreamer, const MachineInstr &MI);
 
   void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
@@ -141,12 +128,8 @@ class AArch64AsmPrinter : public AsmPrinter {
 
   void emitSled(const MachineInstr &MI, SledKind Kind);
 
-  // Emit the sequence for BRA/BLRA (authenticate + branch/call).
+  // Emit the sequence for BLRA (authenticate + branch).
   void emitPtrauthBranch(const MachineInstr *MI);
-
-  // Emit the sequence for AUT or AUTPAC.
-  void emitPtrauthAuthResign(const MachineInstr *MI);
-
   // Emit the sequence to compute a discriminator into x17, or reuse AddrDisc.
   unsigned emitPtrauthDiscriminator(uint16_t Disc, unsigned AddrDisc,
                                     unsigned &InstsEmitted);
@@ -1371,139 +1354,6 @@ void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer,
                                   .addImm(Size == 4 ? 0 : 2));
 }
 
-void AArch64AsmPrinter::LowerHardenedBRJumpTable(const MachineInstr &MI) {
-  unsigned InstsEmitted = 0;
-
-  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
-  assert(MJTI && "Can't lower jump-table dispatch without JTI");
-
-  const std::vector<MachineJumpTableEntry> &JTs = MJTI->getJumpTables();
-  assert(!JTs.empty() && "Invalid JT index for jump-table dispatch");
-
-  // Emit:
-  //     mov x17, #<size of table>     ; depending on table size, with MOVKs
-  //     cmp x16, x17                  ; or #imm if table size fits in 12-bit
-  //     csel x16, x16, xzr, ls        ; check for index overflow
-  //
-  //     adrp x17, Ltable@PAGE         ; materialize table address
-  //     add x17, Ltable@PAGEOFF
-  //     ldrsw x16, [x17, x16, lsl #2] ; load table entry
-  //
-  //   Lanchor:
-  //     adr x17, Lanchor              ; compute target address
-  //     add x16, x17, x16
-  //     br x16                        ; branch to target
-
-  MachineOperand JTOp = MI.getOperand(0);
-
-  unsigned JTI = JTOp.getIndex();
-  assert(!AArch64FI->getJumpTableEntryPCRelSymbol(JTI) &&
-         "unsupported compressed jump table");
-
-  const uint64_t NumTableEntries = JTs[JTI].MBBs.size();
-
-  // cmp only supports a 12-bit immediate.  If we need more, materialize the
-  // immediate, using x17 as a scratch register.
-  uint64_t MaxTableEntry = NumTableEntries - 1;
-  if (isUInt<12>(MaxTableEntry)) {
-    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::SUBSXri)
-                                     .addReg(AArch64::XZR)
-                                     .addReg(AArch64::X16)
-                                     .addImm(MaxTableEntry)
-                                     .addImm(0));
-    ++InstsEmitted;
-  } else {
-    EmitToStreamer(*OutStreamer,
-                   MCInstBuilder(AArch64::MOVZXi)
-                       .addReg(AArch64::X17)
-                       .addImm(static_cast<uint16_t>(MaxTableEntry))
-                       .addImm(0));
-    ++InstsEmitted;
-    // It's sad that we have to manually materialize instructions, but we can't
-    // trivially reuse the main pseudo expansion logic.
-    // A MOVK sequence is easy enough to generate and handles the general case.
-    for (int Offset = 16; Offset < 64; Offset += 16) {
-      if ((MaxTableEntry >> Offset) == 0)
-        break;
-      EmitToStreamer(*OutStreamer,
-                     MCInstBuilder(AArch64::MOVKXi)
-                         .addReg(AArch64::X17)
-                         .addReg(AArch64::X17)
-                         .addImm(static_cast<uint16_t>(MaxTableEntry >> Offset))
-                         .addImm(Offset));
-      ++InstsEmitted;
-    }
-    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::SUBSXrs)
-                                     .addReg(AArch64::XZR)
-                                     .addReg(AArch64::X16)
-                                     .addReg(AArch64::X17)
-                                     .addImm(0));
-    ++InstsEmitted;
-  }
-
-  // This picks entry #0 on failure.
-  // We might want to trap instead.
-  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::CSELXr)
-                                   .addReg(AArch64::X16)
-                                   .addReg(AArch64::X16)
-                                   .addReg(AArch64::XZR)
-                                   .addImm(AArch64CC::LS));
-  ++InstsEmitted;
-
-  // Prepare the @PAGE/@PAGEOFF low/high operands.
-  MachineOperand JTMOHi(JTOp), JTMOLo(JTOp);
-  MCOperand JTMCHi, JTMCLo;
-
-  JTMOHi.setTargetFlags(AArch64II::MO_PAGE);
-  JTMOLo.setTargetFlags(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
-
-  MCInstLowering.lowerOperand(JTMOHi, JTMCHi);
-  MCInstLowering.lowerOperand(JTMOLo, JTMCLo);
-
-  EmitToStreamer(
-      *OutStreamer,
-      MCInstBuilder(AArch64::ADRP).addReg(AArch64::X17).addOperand(JTMCHi));
-  ++InstsEmitted;
-
-  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXri)
-                                   .addReg(AArch64::X17)
-                                   .addReg(AArch64::X17)
-                                   .addOperand(JTMCLo)
-                                   .addImm(0));
-  ++InstsEmitted;
-
-  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX)
-                                   .addReg(AArch64::X16)
-                                   .addReg(AArch64::X17)
-                                   .addReg(AArch64::X16)
-                                   .addImm(0)
-                                   .addImm(1));
-  ++InstsEmitted;
-
-  MCSymbol *AdrLabel = MF->getContext().createTempSymbol();
-  const auto *AdrLabelE = MCSymbolRefExpr::create(AdrLabel, MF->getContext());
-  AArch64FI->setJumpTableEntryInfo(JTI, 4, AdrLabel);
-
-  OutStreamer->emitLabel(AdrLabel);
-  EmitToStreamer(
-      *OutStreamer,
-      MCInstBuilder(AArch64::ADR).addReg(AArch64::X17).addExpr(AdrLabelE));
-  ++InstsEmitted;
-
-  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXrs)
-                                   .addReg(AArch64::X16)
-                                   .addReg(AArch64::X17)
-                                   .addReg(AArch64::X16)
-                                   .addImm(0));
-  ++InstsEmitted;
-
-  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::BR).addReg(AArch64::X16));
-  ++InstsEmitted;
-
-  (void)InstsEmitted;
-  assert(STI->getInstrInfo()->getInstSizeInBytes(MI) >= InstsEmitted * 4);
-}
-
 void AArch64AsmPrinter::LowerMOPS(llvm::MCStreamer &OutStreamer,
                                   const llvm::MachineInstr &MI) {
   unsigned Opcode = MI.getOpcode();
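
For reference, the two mechanisms the removed LowerHardenedBRJumpTable comments describe -- clamping an out-of-range index to entry #0, and building the max-entry immediate 16 bits at a time when it does not fit cmp's 12-bit field -- reduce to this standalone C++ sketch (illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

// csel x16, x16, xzr, ls: an index past the end selects entry #0.
uint64_t clampIndex(uint64_t Index, uint64_t NumEntries) {
  uint64_t MaxEntry = NumEntries - 1;
  return Index <= MaxEntry ? Index : 0;
}

// Number of instructions needed to compare against MaxEntry: one cmp if it
// fits 12 bits, otherwise a movz plus one movk per remaining non-zero
// 16-bit chunk, plus the final register-register cmp.
unsigned compareCost(uint64_t MaxEntry) {
  if (MaxEntry < (1u << 12))
    return 1;
  unsigned N = 1; // movz of bits [15:0]
  for (int Offset = 16; Offset < 64; Offset += 16) {
    if ((MaxEntry >> Offset) == 0)
      break;
    ++N; // movk of bits [Offset+15:Offset]
  }
  return N + 1; // the cmp itself
}

int main() {
  assert(clampIndex(7, 4) == 0);
  assert(clampIndex(3, 4) == 3);
  assert(compareCost(0xfff) == 1);
  assert(compareCost(0x12345) == 3); // movz, movk, cmp
}
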
@@ -1773,225 +1623,8 @@ unsigned AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
   return AArch64::X17;
 }
 
-void AArch64AsmPrinter::emitPtrauthAuthResign(const MachineInstr *MI) {
-  unsigned InstsEmitted = 0;
-  const bool IsAUTPAC = MI->getOpcode() == AArch64::AUTPAC;
-
-  // We can expand AUT/AUTPAC into 3 possible sequences:
-  // - unchecked:
-  //      autia x16, x0
-  //      pacib x16, x1 ; if AUTPAC
-  //
-  // - checked and clearing:
-  //      mov x17, x0
-  //      movk x17, #disc, lsl #48
-  //      autia x16, x17
-  //      mov x17, x16
-  //      xpaci x17
-  //      cmp x16, x17
-  //      b.eq Lsuccess
-  //      mov x16, x17
-  //      b Lend
-  //     Lsuccess:
-  //      mov x17, x1
-  //      movk x17, #disc, lsl #48
-  //      pacib x16, x17
-  //     Lend:
-  //   Where we only emit the AUT if we started with an AUT.
-  //
-  // - checked and trapping:
-  //      mov x17, x0
-  //      movk x17, #disc, lsl #48
-  //      autia x16, x0
-  //      mov x17, x16
-  //      xpaci x17
-  //      cmp x16, x17
-  //      b.eq Lsuccess
-  //      brk #<0xc470 + aut key>
-  //     Lsuccess:
-  //      mov x17, x1
-  //      movk x17, #disc, lsl #48
-  //      pacib x16, x17 ; if AUTPAC
-  //   Where the b.eq skips over the trap if the PAC is valid.
-  //
-  // This sequence is expensive, but we need more information to be able to
-  // do better.
-  //
-  // We can't TBZ the poison bit because EnhancedPAC2 XORs the PAC bits
-  // on failure.
-  // We can't TST the PAC bits because we don't always know how the address
-  // space is setup for the target environment (and the bottom PAC bit is
-  // based on that).
-  // Either way, we also don't always know whether TBI is enabled or not for
-  // the specific target environment.
-
-  // By default, auth/resign sequences check for auth failures.
-  bool ShouldCheck = true;
-  // In the checked sequence, we only trap if explicitly requested.
-  bool ShouldTrap = MF->getFunction().hasFnAttribute("ptrauth-auth-traps");
-
-  // On an FPAC CPU, you get traps whether you want them or not: there's
-  // no point in emitting checks or traps.
-  if (STI->hasFPAC())
-    ShouldCheck = ShouldTrap = false;
-
-  // However, command-line flags can override this, for experimentation.
-  switch (PtrauthAuthChecks) {
-  case PtrauthCheckMode::Default:
-    break;
-  case PtrauthCheckMode::Unchecked:
-    ShouldCheck = ShouldTrap = false;
-    break;
-  case PtrauthCheckMode::Poison:
-    ShouldCheck = true;
-    ShouldTrap = false;
-    break;
-  case PtrauthCheckMode::Trap:
-    ShouldCheck = ShouldTrap = true;
-    break;
-  }
-
-  auto AUTKey = (AArch64PACKey::ID)MI->getOperand(0).getImm();
-  uint64_t AUTDisc = MI->getOperand(1).getImm();
-  unsigned AUTAddrDisc = MI->getOperand(2).getReg();
-
-  unsigned XPACOpc = getXPACOpcodeForKey(AUTKey);
-
-  // Compute aut discriminator into x17
-  assert(isUInt<16>(AUTDisc));
-  unsigned AUTDiscReg =
-      emitPtrauthDiscriminator(AUTDisc, AUTAddrDisc, InstsEmitted);
-  bool AUTZero = AUTDiscReg == AArch64::XZR;
-  unsigned AUTOpc = getAUTOpcodeForKey(AUTKey, AUTZero);
-
-  //  autiza x16      ; if  AUTZero
-  //  autia x16, x17  ; if !AUTZero
-  MCInst AUTInst;
-  AUTInst.setOpcode(AUTOpc);
-  AUTInst.addOperand(MCOperand::createReg(AArch64::X16));
-  AUTInst.addOperand(MCOperand::createReg(AArch64::X16));
-  if (!AUTZero)
-    AUTInst.addOperand(MCOperand::createReg(AUTDiscReg));
-  EmitToStreamer(*OutStreamer, AUTInst);
-  ++InstsEmitted;
-
-  // Unchecked or checked-but-non-trapping AUT is just an "AUT": we're done.
-  if (!IsAUTPAC && (!ShouldCheck || !ShouldTrap)) {
-    assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4);
-    return;
-  }
-
-  MCSymbol *EndSym = nullptr;
-
-  // Checked sequences do an additional strip-and-compare.
-  if (ShouldCheck) {
-    MCSymbol *SuccessSym = createTempSymbol("auth_success_");
-
-    // XPAC has tied src/dst: use x17 as a temporary copy.
-    //  mov x17, x16
-    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ORRXrs)
-                                     .addReg(AArch64::X17)
-                                     .addReg(AArch64::XZR)
-                                     .addReg(AArch64::X16)
-                                     .addImm(0));
-    ++InstsEmitted;
-
-    //  xpaci x17
-    EmitToStreamer(
-        *OutStreamer,
-        MCInstBuilder(XPACOpc).addReg(AArch64::X17).addReg(AArch64::X17));
-    ++InstsEmitted;
-
-    //  cmp x16, x17
-    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::SUBSXrs)
-                                     .addReg(AArch64::XZR)
-                                     .addReg(AArch64::X16)
-                                     .addReg(AArch64::X17)
-                                     .addImm(0));
-    ++InstsEmitted;
-
-    //  b.eq Lsuccess
-    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::Bcc)
-                                     .addImm(AArch64CC::EQ)
-                                     .addExpr(MCSymbolRefExpr::create(
-                                         SuccessSym, OutContext)));
-    ++InstsEmitted;
-
-    if (ShouldTrap) {
-      // Trapping sequences do a 'brk'.
-      //  brk #<0xc470 + aut key>
-      EmitToStreamer(*OutStreamer,
-                     MCInstBuilder(AArch64::BRK).addImm(0xc470 | AUTKey));
-      ++InstsEmitted;
-    } else {
-      // Non-trapping checked sequences return the stripped result in x16,
-      // skipping over the PAC if there is one.
-
-      // FIXME: can we simply return the AUT result, already in x16? without..
-      //        ..traps this is usable as an oracle anyway, based on high bits
-      //  mov x17, x16
-      EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ORRXrs)
-                                       .addReg(AArch64::X16)
-                                       .addReg(AArch64::XZR)
-                                       .addReg(AArch64::X17)
-                                       .addImm(0));
-      ++InstsEmitted;
-
-      if (IsAUTPAC) {
-        EndSym = createTempSymbol("resign_end_");
-
-        //  b Lend
-        EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::B)
-                                         .addExpr(MCSymbolRefExpr::create(
-                                             EndSym, OutContext)));
-        ++InstsEmitted;
-      }
-    }
-
-    // If the auth check succeeds, we can continue.
-    // Lsuccess:
-    OutStreamer->emitLabel(SuccessSym);
-  }
-
-  // We already emitted unchecked and checked-but-non-trapping AUTs.
-  // That left us with trapping AUTs, and AUTPACs.
-  // Trapping AUTs don't need PAC: we're done.
-  if (!IsAUTPAC) {
-    assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4);
-    return;
-  }
-
-  auto PACKey = (AArch64PACKey::ID)MI->getOperand(3).getImm();
-  uint64_t PACDisc = MI->getOperand(4).getImm();
-  unsigned PACAddrDisc = MI->getOperand(5).getReg();
-
-  // Compute pac discriminator into x17
-  assert(isUInt<16>(PACDisc));
-  unsigned PACDiscReg =
-      emitPtrauthDiscriminator(PACDisc, PACAddrDisc, InstsEmitted);
-  bool PACZero = PACDiscReg == AArch64::XZR;
-  unsigned PACOpc = getPACOpcodeForKey(PACKey, PACZero);
-
-  //  pacizb x16      ; if  PACZero
-  //  pacib x16, x17  ; if !PACZero
-  MCInst PACInst;
-  PACInst.setOpcode(PACOpc);
-  PACInst.addOperand(MCOperand::createReg(AArch64::X16));
-  PACInst.addOperand(MCOperand::createReg(AArch64::X16));
-  if (!PACZero)
-    PACInst.addOperand(MCOperand::createReg(PACDiscReg));
-  EmitToStreamer(*OutStreamer, PACInst);
-  ++InstsEmitted;
-
-  assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4);
-  //  Lend:
-  if (EndSym)
-    OutStreamer->emitLabel(EndSym);
-}
-
 void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
   unsigned InstsEmitted = 0;
-  bool IsCall = MI->getOpcode() == AArch64::BLRA;
   unsigned BrTarget = MI->getOperand(0).getReg();
 
   auto Key = (AArch64PACKey::ID)MI->getOperand(1).getImm();
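
The ShouldCheck/ShouldTrap selection in the removed emitPtrauthAuthResign above reduces to a small decision function; a standalone sketch (illustrative names, not LLVM code):

#include <cstdio>

// By default the expansion is checked but only traps when the function
// asks for it; FPAC hardware makes software checks redundant; an explicit
// command-line mode overrides everything, for experimentation.
enum class CheckMode { Default, Unchecked, Poison, Trap };

struct Policy { bool Check, Trap; };

Policy resolve(bool HasFPAC, bool FnWantsTraps, CheckMode Mode) {
  bool Check = true;        // checked by default
  bool Trap = FnWantsTraps; // trap only on request
  if (HasFPAC)              // hardware already faults on bad PACs
    Check = Trap = false;
  switch (Mode) {
  case CheckMode::Default: break;
  case CheckMode::Unchecked: Check = Trap = false; break;
  case CheckMode::Poison: Check = true; Trap = false; break;
  case CheckMode::Trap: Check = Trap = true; break;
  }
  return {Check, Trap};
}

int main() {
  Policy P = resolve(/*HasFPAC=*/false, /*FnWantsTraps=*/true,
                     CheckMode::Default);
  std::printf("check=%d trap=%d\n", P.Check, P.Trap); // check=1 trap=1
}
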
@@ -2008,17 +1641,10 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
   bool IsZeroDisc = DiscReg == AArch64::XZR;
 
   unsigned Opc;
-  if (IsCall) {
-    if (Key == AArch64PACKey::IA)
-      Opc = IsZeroDisc ? AArch64::BLRAAZ : AArch64::BLRAA;
-    else
-      Opc = IsZeroDisc ? AArch64::BLRABZ : AArch64::BLRAB;
-  } else {
-    if (Key == AArch64PACKey::IA)
-      Opc = IsZeroDisc ? AArch64::BRAAZ : AArch64::BRAA;
-    else
-      Opc = IsZeroDisc ? AArch64::BRABZ : AArch64::BRAB;
-  }
+  if (Key == AArch64PACKey::IA)
+    Opc = IsZeroDisc ? AArch64::BLRAAZ : AArch64::BLRAA;
+  else
+    Opc = IsZeroDisc ? AArch64::BLRABZ : AArch64::BLRAB;
 
   MCInst BRInst;
   BRInst.setOpcode(Opc);
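
A small standalone model (not LLVM code) of the opcode choice in the hunk above: the authenticated-call mnemonic is selected from the key and from whether the discriminator is the zero register, where the *Z forms take an implicit zero discriminator:

#include <cstdio>

enum class Key { IA, IB };

const char *blraMnemonic(Key K, bool ZeroDisc) {
  if (K == Key::IA)
    return ZeroDisc ? "blraaz" : "blraa";
  return ZeroDisc ? "blrabz" : "blrab";
}

int main() {
  std::printf("%s\n", blraMnemonic(Key::IA, true));  // blraaz
  std::printf("%s\n", blraMnemonic(Key::IB, false)); // blrab
}
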
@@ -2295,19 +1921,6 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) {
   assert(STI->getInstrInfo()->getInstSizeInBytes(MI) >= InstsEmitted * 4);
 }
 
-const MCExpr *
-AArch64AsmPrinter::lowerBlockAddressConstant(const BlockAddress &BA) {
-  const MCExpr *BAE = AsmPrinter::lowerBlockAddressConstant(BA);
-  const Function &Fn = *BA.getFunction();
-
-  if (std::optional<uint16_t> BADisc =
-          STI->getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn))
-    return AArch64AuthMCExpr::create(BAE, *BADisc, AArch64PACKey::IA,
-                                     /*HasAddressDiversity=*/false, OutContext);
-
-  return BAE;
-}
-
 // Simple pseudo-instructions have their lowering (with expansion to real
 // instructions) auto-generated.
 #include "AArch64GenMCPseudoLowering.inc"
@@ -2443,11 +2056,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
 
-  case AArch64::AUT:
-  case AArch64::AUTPAC:
-    emitPtrauthAuthResign(MI);
-    return;
-
   case AArch64::LOADauthptrstatic:
     LowerLOADauthptrstatic(*MI);
     return;
@@ -2457,7 +2065,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     LowerMOVaddrPAC(*MI);
     return;
 
-  case AArch64::BRA:
   case AArch64::BLRA:
     emitPtrauthBranch(MI);
     return;
@@ -2625,10 +2232,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     LowerJumpTableDest(*OutStreamer, *MI);
     return;
 
-  case AArch64::BR_JumpTable:
-    LowerHardenedBRJumpTable(*MI);
-    return;
-
   case AArch64::FMOVH0:
   case AArch64::FMOVS0:
   case AArch64::FMOVD0:
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 719d79dec0c31..527496f1a6374 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -2516,10 +2516,6 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
   if (AddrReg == 0)
     return false;
 
-  // Authenticated indirectbr is not implemented yet.
-  if (FuncInfo.MF->getFunction().hasFnAttribute("ptrauth-indirect-gotos"))
-    return false;
-
   // Emit the indirect branch.
   const MCInstrDesc &II = TII.get(AArch64::BR);
   AddrReg = constrainOperandRegClass(II, AddrReg,  II.getNumDefs());
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index a1ae0873fc190..832e44fe117e2 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -185,9 +185,6 @@ def FeatureJS : ExtensionWithMArch<"jsconv", "JS", "FEAT_JSCVT",
   "Enable Armv8.3-A JavaScript FP conversion instructions",
   [FeatureFPARMv8]>;
 
-def FeatureFPAC : Extension<"fpac", "FPAC", "FEAT_FPAC",
-  "Enable v8.3-A Pointer Authentication Faulting enhancement">;
-
 def FeatureCCIDX : Extension<"ccidx", "CCIDX", "FEAT_CCIDX",
   "Enable Armv8.3-A Extend of the CCSIDR number of sets">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 55cc106c08b95..b9c57d1975b6f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -366,9 +366,6 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
 
   bool tryIndexedLoad(SDNode *N);
 
-  void SelectPtrauthAuth(SDNode *N);
-  void SelectPtrauthResign(SDNode *N);
-
   bool trySelectStackSlotTagP(SDNode *N);
   void SelectTagP(SDNode *N);
 
@@ -1484,96 +1481,6 @@ void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
   ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
 }
 
-static std::tuple<SDValue, SDValue>
-extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
-  SDLoc DL(Disc);
-  SDValue AddrDisc;
-  SDValue ConstDisc;
-
-  // If this is a blend, remember the constant and address discriminators.
-  // Otherwise, it's either a constant discriminator, or a non-blended
-  // address discriminator.
-  if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
-      Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
-    AddrDisc = Disc->getOperand(1);
-    ConstDisc = Disc->getOperand(2);
-  } else {
-    ConstDisc = Disc;
-  }
-
-  // If the constant discriminator (either the blend RHS, or the entire
-  // discriminator value) isn't a 16-bit constant, bail out, and let the
-  // discriminator be computed separately.
-  auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
-  if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
-    return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
-
-  // If there's no address discriminator, use XZR directly.
-  if (!AddrDisc)
-    AddrDisc = DAG->getRegister(AArch64::XZR, MVT::i64);
-
-  return std::make_tuple(
-      DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
-      AddrDisc);
-}
-
-void AArch64DAGToDAGISel::SelectPtrauthAuth(SDNode *N) {
-  SDLoc DL(N);
-  // IntrinsicID is operand #0
-  SDValue Val = N->getOperand(1);
-  SDValue AUTKey = N->getOperand(2);
-  SDValue AUTDisc = N->getOperand(3);
-
-  unsigned AUTKeyC = cast<ConstantSDNode>(AUTKey)->getZExtValue();
-  AUTKey = CurDAG->getTargetConstant(AUTKeyC, DL, MVT::i64);
-
-  SDValue AUTAddrDisc, AUTConstDisc;
-  std::tie(AUTConstDisc, AUTAddrDisc) =
-      extractPtrauthBlendDiscriminators(AUTDisc, CurDAG);
-
-  SDValue X16Copy = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL,
-                                         AArch64::X16, Val, SDValue());
-  SDValue Ops[] = {AUTKey, AUTConstDisc, AUTAddrDisc, X16Copy.getValue(1)};
-
-  SDNode *AUT = CurDAG->getMachineNode(AArch64::AUT, DL, MVT::i64, Ops);
-  ReplaceNode(N, AUT);
-  return;
-}
-
-void AArch64DAGToDAGISel::SelectPtrauthResign(SDNode *N) {
-  SDLoc DL(N);
-  // IntrinsicID is operand #0
-  SDValue Val = N->getOperand(1);
-  SDValue AUTKey = N->getOperand(2);
-  SDValue AUTDisc = N->getOperand(3);
-  SDValue PACKey = N->getOperand(4);
-  SDValue PACDisc = N->getOperand(5);
-
-  unsigned AUTKeyC = cast<ConstantSDNode>(AUTKey)->getZExtValue();
-  unsigned PACKeyC = cast<ConstantSDNode>(PACKey)->getZExtValue();
-
-  AUTKey = CurDAG->getTargetConstant(AUTKeyC, DL, MVT::i64);
-  PACKey = CurDAG->getTargetConstant(PACKeyC, DL, MVT::i64);
-
-  SDValue AUTAddrDisc, AUTConstDisc;
-  std::tie(AUTConstDisc, AUTAddrDisc) =
-      extractPtrauthBlendDiscriminators(AUTDisc, CurDAG);
-
-  SDValue PACAddrDisc, PACConstDisc;
-  std::tie(PACConstDisc, PACAddrDisc) =
-      extractPtrauthBlendDiscriminators(PACDisc, CurDAG);
-
-  SDValue X16Copy = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL,
-                                         AArch64::X16, Val, SDValue());
-
-  SDValue Ops[] = {AUTKey,       AUTConstDisc, AUTAddrDisc,        PACKey,
-                   PACConstDisc, PACAddrDisc,  X16Copy.getValue(1)};
-
-  SDNode *AUTPAC = CurDAG->getMachineNode(AArch64::AUTPAC, DL, MVT::i64, Ops);
-  ReplaceNode(N, AUTPAC);
-  return;
-}
-
 bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   if (LD->isUnindexed())
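
The removed extractPtrauthBlendDiscriminators splits a discriminator into a foldable 16-bit constant and a dynamic address part. A rough standalone model of that policy (illustrative types and names, not LLVM code):

#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

// A discriminator is blend(addr, constant), a bare constant, or a bare
// dynamic value. Only a 16-bit constant may be folded into the pseudo's
// immediate operand; anything else stays a dynamic operand.
struct Disc {
  std::optional<uint64_t> ConstPart; // set iff a constant part is known
  std::optional<uint64_t> AddrPart;  // set iff this is a blend
  uint64_t Raw = 0;                  // stands in for the original SDValue
};

// Returns {immediate operand, dynamic operand}; 0 models XZR.
std::pair<uint16_t, uint64_t> split(const Disc &D) {
  if (!D.ConstPart || *D.ConstPart > 0xFFFF)
    return {0, D.Raw}; // bail: compute the discriminator separately
  return {static_cast<uint16_t>(*D.ConstPart), D.AddrPart.value_or(0)};
}

int main() {
  Disc Blend{/*ConstPart=*/42, /*AddrPart=*/0x1000, /*Raw=*/0};
  assert(split(Blend) == std::make_pair(uint16_t{42}, uint64_t{0x1000}));
  Disc Wide{/*ConstPart=*/0x12345, /*AddrPart=*/std::nullopt, /*Raw=*/7};
  assert(split(Wide) == std::make_pair(uint16_t{0}, uint64_t{7}));
}
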
@@ -5530,15 +5437,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::aarch64_tagp:
       SelectTagP(Node);
       return;
-
-    case Intrinsic::ptrauth_auth:
-      SelectPtrauthAuth(Node);
-      return;
-
-    case Intrinsic::ptrauth_resign:
-      SelectPtrauthResign(Node);
-      return;
-
     case Intrinsic::aarch64_neon_tbl2:
       SelectTable(Node, 2,
                   VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 87e7750768d2d..bf205b1706a6c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -85,7 +85,6 @@
 #include "llvm/Support/InstructionCost.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/SipHash.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -510,7 +509,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
   setOperationAction(ISD::BR_JT, MVT::Other, Custom);
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
-  setOperationAction(ISD::BRIND, MVT::Other, Custom);
   setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);
 
   setOperationAction(ISD::PtrAuthGlobalAddress, MVT::i64, Custom);
@@ -3405,11 +3403,6 @@ static bool isLegalArithImmed(uint64_t C) {
   return IsLegal;
 }
 
-static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
-  KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
-  return !KnownSrc.getSignedMinValue().isMinSignedValue();
-}
-
 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
 // can be set differently by this operation. It comes down to whether
@@ -3417,14 +3410,12 @@ static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
 // everything is fine. If not then the optimization is wrong. Thus general
 // comparisons are only valid if op2 != 0.
 //
-// So, finally, the only LLVM-native comparisons that don't mention C or V
-// are the ones that aren't unsigned comparisons. They're the only ones we can
-// safely use CMN for in the absence of information about op2.
-static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
+// So, finally, the only LLVM-native comparisons that don't mention C and V
+// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+// the absence of information about op2.
+static bool isCMN(SDValue Op, ISD::CondCode CC) {
   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
-         (isIntEqualitySetCC(CC) ||
-          (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
-          (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
+         (CC == ISD::SETEQ || CC == ISD::SETNE);
 }
 
 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
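
The EQ/NE restriction in the restored isCMN comment can be checked directly: for equality only the Z flag matters, and a - (-b) == 0 iff a + b == 0 mod 2^64, but the carry flag of a subtraction and an addition can disagree. A standalone demonstration (not LLVM code):

#include <cstdint>
#include <cstdio>

struct Flags { bool Z, C; };

Flags subs(uint64_t A, uint64_t B) { // cmp A, B
  return {A - B == 0, A >= B};       // C set when no borrow occurs
}

Flags adds(uint64_t A, uint64_t B) { // cmn A, B
  uint64_t R = A + B;
  return {R == 0, R < A};            // C set when the addition wraps
}

int main() {
  uint64_t X = 1, Y = 0;
  Flags Cmp = subs(X, 0 - Y); // cmp x, (0 - y)  (here: cmp 1, 0)
  Flags Cmn = adds(X, Y);     // cmn x, y        (here: adds 1, 0)
  std::printf("Z: %d vs %d\n", Cmp.Z, Cmn.Z); // Z agrees: 0 vs 0
  std::printf("C: %d vs %d\n", Cmp.C, Cmn.C); // C differs: 1 vs 0
}

So an unsigned condition such as HS (C set) would change meaning under the CMP-to-CMN rewrite, while EQ/NE survive.
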
@@ -3469,12 +3460,11 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   // register to WZR/XZR if it ends up being unused.
   unsigned Opcode = AArch64ISD::SUBS;
 
-  if (isCMN(RHS, CC, DAG)) {
+  if (isCMN(RHS, CC)) {
     // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     RHS = RHS.getOperand(1);
-  } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
-             isIntEqualitySetCC(CC)) {
+  } else if (isCMN(LHS, CC)) {
     // As we are looking for EQ/NE compares, the operands can be commuted ; can
     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
@@ -3576,15 +3566,13 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
       Opcode = AArch64ISD::CCMN;
       RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
     }
-  } else if (isCMN(RHS, CC, DAG)) {
-    Opcode = AArch64ISD::CCMN;
-    RHS = RHS.getOperand(1);
-  } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
-             isIntEqualitySetCC(CC)) {
-    // As we are looking for EQ/NE compares, the operands can be commuted ; can
-    // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
-    Opcode = AArch64ISD::CCMN;
-    LHS = LHS.getOperand(1);
+  } else if (RHS.getOpcode() == ISD::SUB) {
+    SDValue SubOp0 = RHS.getOperand(0);
+    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+      // See emitComparison() on why we can only do this for SETEQ and SETNE.
+      Opcode = AArch64ISD::CCMN;
+      RHS = RHS.getOperand(1);
+    }
   }
   if (Opcode == 0)
     Opcode = AArch64ISD::CCMP;
@@ -3902,8 +3890,8 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   //    cmp     w12, w11, lsl #1
   if (!isa<ConstantSDNode>(RHS) ||
       !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) {
-    bool LHSIsCMN = isCMN(LHS, CC, DAG);
-    bool RHSIsCMN = isCMN(RHS, CC, DAG);
+    bool LHSIsCMN = isCMN(LHS, CC);
+    bool RHSIsCMN = isCMN(RHS, CC);
     SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
     SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
 
@@ -3916,7 +3904,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 
   SDValue Cmp;
   AArch64CC::CondCode AArch64CC;
-  if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
+  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
     const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
 
     // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
@@ -6723,8 +6711,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerJumpTable(Op, DAG);
   case ISD::BR_JT:
     return LowerBR_JT(Op, DAG);
-  case ISD::BRIND:
-    return LowerBRIND(Op, DAG);
   case ISD::ConstantPool:
     return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:
@@ -10746,30 +10732,6 @@ SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
   auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
   AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
 
-  // With aarch64-jump-table-hardening, we only expand the jump table dispatch
-  // sequence later, to guarantee the integrity of the intermediate values.
-  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
-          "aarch64-jump-table-hardening")) {
-    CodeModel::Model CM = getTargetMachine().getCodeModel();
-    if (Subtarget->isTargetMachO()) {
-      if (CM != CodeModel::Small && CM != CodeModel::Large)
-        report_fatal_error("Unsupported code-model for hardened jump-table");
-    } else {
-      // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
-      assert(Subtarget->isTargetELF() &&
-             "jump table hardening only supported on MachO/ELF");
-      if (CM != CodeModel::Small)
-        report_fatal_error("Unsupported code-model for hardened jump-table");
-    }
-
-    SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
-                                       Entry, SDValue());
-    SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
-                                   DAG.getTargetJumpTable(JTI, MVT::i32),
-                                   X16Copy.getValue(0), X16Copy.getValue(1));
-    return SDValue(B, 0);
-  }
-
   SDNode *Dest =
       DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
                          Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
@@ -10777,33 +10739,6 @@ SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
   return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
 }
 
-SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain = Op.getOperand(0);
-  SDValue Dest = Op.getOperand(1);
-
-  // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
-  // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
-  if (Dest->isMachineOpcode() &&
-      Dest->getMachineOpcode() == AArch64::JumpTableDest32)
-    return SDValue();
-
-  const MachineFunction &MF = DAG.getMachineFunction();
-  std::optional<uint16_t> BADisc =
-      Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
-  if (!BADisc)
-    return SDValue();
-
-  SDLoc DL(Op);
-
-  SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
-  SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
-  SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
-
-  SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
-                                   {Dest, Key, Disc, AddrDisc, Chain});
-  return SDValue(BrA, 0);
-}
-
 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                  SelectionDAG &DAG) const {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
@@ -10823,37 +10758,15 @@ SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
 
 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
-  BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
-  const BlockAddress *BA = BAN->getBlockAddress();
-
-  if (std::optional<uint16_t> BADisc =
-          Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
-              *BA->getFunction())) {
-    SDLoc DL(Op);
-
-    // This isn't cheap, but BRIND is rare.
-    SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
-
-    SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
-
-    SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
-    SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
-
-    SDNode *MOV =
-        DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
-                           {TargetBA, Key, AddrDisc, Disc});
-    return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
-                              SDValue(MOV, 1));
-  }
-
+  BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
   CodeModel::Model CM = getTargetMachine().getCodeModel();
   if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
     if (!getTargetMachine().isPositionIndependent())
-      return getAddrLarge(BAN, DAG);
+      return getAddrLarge(BA, DAG);
   } else if (CM == CodeModel::Tiny) {
-    return getAddrTiny(BAN, DAG);
+    return getAddrTiny(BA, DAG);
   }
-  return getAddr(BAN, DAG);
+  return getAddr(BA, DAG);
 }
 
 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
@@ -17653,6 +17566,71 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
   return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
 }
 
+// Turn [sign|zero]_extend(vecreduce_add()) into SVE's  SADDV|UADDV
+// instructions.
+static SDValue
+performVecReduceAddExtCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const AArch64TargetLowering &TLI) {
+  if (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND)
+    return SDValue();
+  bool IsSigned = N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND;
+
+  SelectionDAG &DAG = DCI.DAG;
+  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  SDValue VecOp = N->getOperand(0).getOperand(0);
+  EVT VecOpVT = VecOp.getValueType();
+  SDLoc DL(N);
+
+  // Split the input vectors if not legal, e.g.
+  // i32 (vecreduce_add (zext nxv32i8 %op to nxv32i32))
+  // ->
+  // i32 (add
+  //   (i32 vecreduce_add (zext nxv16i8 %op.lo to nxv16i32)),
+  //   (i32 vecreduce_add (zext nxv16i8 %op.hi to nxv16i32)))
+  if (TLI.getTypeAction(*DAG.getContext(), VecOpVT) ==
+      TargetLowering::TypeSplitVector) {
+    SDValue Lo, Hi;
+    std::tie(Lo, Hi) = DAG.SplitVector(VecOp, DL);
+    unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    EVT HalfVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
+        *DAG.getContext());
+    Lo = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0),
+                     DAG.getNode(ExtOpc, DL, HalfVT, Lo));
+    Hi = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0),
+                     DAG.getNode(ExtOpc, DL, HalfVT, Hi));
+    return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Lo, Hi);
+  }
+
+  if (!TLI.isTypeLegal(VecOpVT))
+    return SDValue();
+
+  if (VecOpVT.isFixedLengthVector() &&
+      !TLI.useSVEForFixedLengthVectorVT(VecOpVT, !Subtarget.isNeonAvailable()))
+    return SDValue();
+
+  // The input type is legal so map VECREDUCE_ADD to UADDV/SADDV, e.g.
+  // i32 (vecreduce_add (zext nxv16i8 %op to nxv16i32))
+  // ->
+  // i32 (UADDV nxv16i8:%op)
+  EVT ElemType = N->getValueType(0);
+  SDValue Pg = getPredicateForVector(DAG, DL, VecOpVT);
+  if (VecOpVT.isFixedLengthVector()) {
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VecOpVT);
+    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+  }
+  SDValue Res =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+                  DAG.getConstant(IsSigned ? Intrinsic::aarch64_sve_saddv
+                                           : Intrinsic::aarch64_sve_uaddv,
+                                  DL, MVT::i64),
+                  Pg, VecOp);
+  if (ElemType != MVT::i64)
+    Res = DAG.getAnyExtOrTrunc(Res, DL, ElemType);
+
+  return Res;
+}
+
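
The soundness fact performVecReduceAddExtCombine relies on -- reducing after a zero/sign extension is just a widened sum over the narrow lanes, which is exactly what UADDV/SADDV compute -- in a standalone scalar form (not LLVM code):

#include <cstdint>
#include <cstdio>

int64_t reduceSExt(const int8_t *V, unsigned N) {
  int64_t Sum = 0;
  for (unsigned I = 0; I != N; ++I)
    Sum += static_cast<int64_t>(V[I]); // sign_extend, then vecreduce_add
  return Sum;                          // == SADDV over the i8 lanes
}

uint64_t reduceZExt(const uint8_t *V, unsigned N) {
  uint64_t Sum = 0;
  for (unsigned I = 0; I != N; ++I)
    Sum += static_cast<uint64_t>(V[I]); // zero_extend, then vecreduce_add
  return Sum;                           // == UADDV over the i8 lanes
}

int main() {
  int8_t S[] = {-128, 127, -1};
  uint8_t U[] = {255, 255, 2};
  std::printf("%lld %llu\n", (long long)reduceSExt(S, 3),
              (unsigned long long)reduceZExt(U, 3)); // -2 512
}
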
 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
 //   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
 //   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
@@ -25338,8 +25316,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performInsertVectorEltCombine(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI, Subtarget);
-  case ISD::VECREDUCE_ADD:
-    return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
+  case ISD::VECREDUCE_ADD: {
+    if (SDValue Val = performVecReduceAddCombine(N, DCI.DAG, Subtarget))
+      return Val;
+    return performVecReduceAddExtCombine(N, DCI, *this);
+  }
   case AArch64ISD::UADDV:
     return performUADDVCombine(N, DAG);
   case AArch64ISD::SMULL:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ef45e4f01ecd3..fcdd47541be82 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1145,7 +1145,6 @@ class AArch64TargetLowering : public TargetLowering {
                          SelectionDAG &DAG) const;
   SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerBRIND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1053ba9242768..643bcc33f9201 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -276,8 +276,6 @@ def HasMatMulFP32    : Predicate<"Subtarget->hasMatMulFP32()">,
                        AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">;
 def HasMatMulFP64    : Predicate<"Subtarget->hasMatMulFP64()">,
                        AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">;
-def HasFPAC          : Predicate<"Subtarget->hasFPAC())">,
-                       AssemblerPredicateWithAll<(all_of FeatureFPAC), "fpac">;
 def HasXS            : Predicate<"Subtarget->hasXS()">,
                        AssemblerPredicateWithAll<(all_of FeatureXS), "xs">;
 def HasWFxT          : Predicate<"Subtarget->hasWFxT()">,
@@ -1145,33 +1143,6 @@ def JumpTableDest8 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
                      Sched<[]>;
 }
 
-// A hardened but more expensive version of jump-table dispatch.
-// This combines the target address computation (otherwise done using the
-// JumpTableDest pseudos above) with the branch itself (otherwise done using
-// a plain BR) in a single non-attackable sequence.
-//
-// We take the final entry index as an operand to allow isel freedom. This does
-// mean that the index can be attacker-controlled.  To address that, we also do
-// limited checking of the offset, mainly ensuring it still points within the
-// jump-table array.  When it doesn't, this branches to the first entry.
-// We might want to trap instead.
-//
-// This is intended for use in conjunction with ptrauth for other code pointers,
-// to avoid signing jump-table entries and turning them into pointers.
-//
-// Entry index is passed in x16.  Clobbers x16/x17/nzcv.
-let isNotDuplicable = 1 in
-def BR_JumpTable : Pseudo<(outs), (ins i32imm:$jti), []>, Sched<[]> {
-  let isBranch = 1;
-  let isTerminator = 1;
-  let isIndirectBranch = 1;
-  let isBarrier = 1;
-  let isNotDuplicable = 1;
-  let Defs = [X16,X17,NZCV];
-  let Uses = [X16];
-  let Size = 44; // 28 fixed + 16 variable, for table size materialization
-}
-
 // Space-consuming pseudo to aid testing of placement and reachability
 // algorithms. Immediate operand is the number of bytes this "instruction"
 // occupies; register operands can be used to enforce dependency and constrain
@@ -1795,24 +1766,6 @@ let Predicates = [HasPAuth] in {
     let Uses = [SP];
   }
 
-  // BRA pseudo, generalized version of BRAA/BRAB/Z.
-  // This directly manipulates x16/x17, which are the only registers the OS
-  // guarantees are safe to use for sensitive operations.
-  def BRA : Pseudo<(outs), (ins GPR64noip:$Rn, i32imm:$Key, i64imm:$Disc,
-                                GPR64noip:$AddrDisc), []>, Sched<[]> {
-    let isCodeGenOnly = 1;
-    let hasNoSchedulingInfo = 1;
-    let hasSideEffects = 1;
-    let mayStore = 0;
-    let mayLoad = 0;
-    let isBranch = 1;
-    let isTerminator = 1;
-    let isBarrier = 1;
-    let isIndirectBranch = 1;
-    let Size = 12; // 4 fixed + 8 variable, to compute discriminator.
-    let Defs = [X17];
-  }
-
   let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
     def RETAA   : AuthReturn<0b010, 0, "retaa">;
     def RETAB   : AuthReturn<0b010, 1, "retab">;
@@ -1823,37 +1776,6 @@ let Predicates = [HasPAuth] in {
   defm LDRAA  : AuthLoad<0, "ldraa", simm10Scaled>;
   defm LDRAB  : AuthLoad<1, "ldrab", simm10Scaled>;
 
-  // AUT pseudo.
-  // This directly manipulates x16/x17, which are the only registers the OS
-  // guarantees are safe to use for sensitive operations.
-  def AUT : Pseudo<(outs), (ins i32imm:$Key, i64imm:$Disc, GPR64noip:$AddrDisc),
-                   []>, Sched<[WriteI, ReadI]> {
-    let isCodeGenOnly = 1;
-    let hasSideEffects = 1;
-    let mayStore = 0;
-    let mayLoad = 0;
-    let Size = 32;
-    let Defs = [X16,X17,NZCV];
-    let Uses = [X16];
-  }
-
-  // AUT and re-PAC a value, using different keys/data.
-  // This directly manipulates x16/x17, which are the only registers the OS
-  // guarantees are safe to use for sensitive operations.
-  def AUTPAC
-      : Pseudo<(outs),
-               (ins i32imm:$AUTKey, i64imm:$AUTDisc, GPR64noip:$AUTAddrDisc,
-                    i32imm:$PACKey, i64imm:$PACDisc, GPR64noip:$PACAddrDisc),
-               []>, Sched<[WriteI, ReadI]> {
-    let isCodeGenOnly = 1;
-    let hasSideEffects = 1;
-    let mayStore = 0;
-    let mayLoad = 0;
-    let Size = 48;
-    let Defs = [X16,X17,NZCV];
-    let Uses = [X16];
-  }
-
   // Materialize a signed global address, with adrp+add and PAC.
   def MOVaddrPAC : Pseudo<(outs),
                           (ins i64imm:$Addr, i32imm:$Key,
@@ -1911,6 +1833,7 @@ let Predicates = [HasPAuth] in {
                                 tcGPR64:$AddrDisc),
               (AUTH_TCRETURN_BTI tcGPRx16x17:$dst, imm:$FPDiff, imm:$Key,
                                  imm:$Disc, tcGPR64:$AddrDisc)>;
+
 }
 
 // v9.5-A pointer authentication extensions
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 32a355fe38f1c..1fad1d5ca6d7d 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -24,7 +24,6 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/Support/SipHash.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 
 using namespace llvm;
@@ -575,17 +574,6 @@ AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
   return AArch64PAuth::AuthCheckMethod::None;
 }
 
-std::optional<uint16_t>
-AArch64Subtarget::getPtrAuthBlockAddressDiscriminatorIfEnabled(
-    const Function &ParentFn) const {
-  if (!ParentFn.hasFnAttribute("ptrauth-indirect-gotos"))
-    return std::nullopt;
-  // We currently have one simple mechanism for all targets.
-  // This isn't ABI, so we can always do better in the future.
-  return getPointerAuthStableSipHash(
-      (Twine(ParentFn.getName()) + " blockaddress").str());
-}
-
 bool AArch64Subtarget::enableMachinePipeliner() const {
   return getSchedModel().hasInstrSchedModel();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index e585aad2f7a68..12c3d25d32ee7 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -415,16 +415,6 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   /// Choose a method of checking LR before performing a tail call.
   AArch64PAuth::AuthCheckMethod getAuthenticatedLRCheckMethod() const;
 
-  /// Compute the integer discriminator for a given BlockAddress constant, if
-  /// blockaddress signing is enabled, or std::nullopt otherwise.
-  /// Blockaddress signing is controlled by the function attribute
-  /// "ptrauth-indirect-gotos" on the parent function.
-  /// Note that this assumes the discriminator is independent of the indirect
-  /// goto branch site itself, i.e., it's the same for all BlockAddresses in
-  /// a function.
-  std::optional<uint16_t>
-  getPtrAuthBlockAddressDiscriminatorIfEnabled(const Function &ParentFn) const;
-
   const PseudoSourceValue *getAddressCheckPSV() const {
     return AddressCheckPSV.get();
   }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index d33b0ab7b9fcd..009928a8a7488 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2552,16 +2552,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     return selectCompareBranch(I, MF, MRI);
 
   case TargetOpcode::G_BRINDIRECT: {
-    const Function &Fn = MF.getFunction();
-    if (std::optional<uint16_t> BADisc =
-            STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn)) {
-      auto MI = MIB.buildInstr(AArch64::BRA, {}, {I.getOperand(0).getReg()});
-      MI.addImm(AArch64PACKey::IA);
-      MI.addImm(*BADisc);
-      MI.addReg(/*AddrDisc=*/AArch64::XZR);
-      I.eraseFromParent();
-      return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
-    }
     I.setDesc(TII.get(AArch64::BR));
     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
   }
@@ -3476,23 +3466,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     return true;
   }
   case TargetOpcode::G_BLOCK_ADDR: {
-    Function *BAFn = I.getOperand(1).getBlockAddress()->getFunction();
-    if (std::optional<uint16_t> BADisc =
-            STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(*BAFn)) {
-      MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
-      MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
-      MIB.buildInstr(AArch64::MOVaddrPAC)
-          .addBlockAddress(I.getOperand(1).getBlockAddress())
-          .addImm(AArch64PACKey::IA)
-          .addReg(/*AddrDisc=*/AArch64::XZR)
-          .addImm(*BADisc)
-          .constrainAllUses(TII, TRI, RBI);
-      MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X16));
-      RBI.constrainGenericRegister(I.getOperand(0).getReg(),
-                                   AArch64::GPR64RegClass, MRI);
-      I.eraseFromParent();
-      return true;
-    }
     if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
       I.eraseFromParent();
@@ -3629,33 +3602,10 @@ bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
   unsigned JTI = I.getOperand(1).getIndex();
   Register Index = I.getOperand(2).getReg();
 
-  MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
-
-  // With aarch64-jump-table-hardening, we only expand the jump table dispatch
-  // sequence later, to guarantee the integrity of the intermediate values.
-  if (MF->getFunction().hasFnAttribute("aarch64-jump-table-hardening")) {
-    CodeModel::Model CM = TM.getCodeModel();
-    if (STI.isTargetMachO()) {
-      if (CM != CodeModel::Small && CM != CodeModel::Large)
-        report_fatal_error("Unsupported code-model for hardened jump-table");
-    } else {
-      // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
-      assert(STI.isTargetELF() &&
-             "jump table hardening only supported on MachO/ELF");
-      if (CM != CodeModel::Small)
-        report_fatal_error("Unsupported code-model for hardened jump-table");
-    }
-
-    MIB.buildCopy({AArch64::X16}, I.getOperand(2).getReg());
-    MIB.buildInstr(AArch64::BR_JumpTable)
-        .addJumpTableIndex(I.getOperand(1).getIndex());
-    I.eraseFromParent();
-    return true;
-  }
-
   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
 
+  MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
                                       {TargetReg, ScratchReg}, {JTAddr, Index})
                            .addJumpTableIndex(JTI);
@@ -6557,64 +6507,6 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
     I.eraseFromParent();
     return true;
   }
-  case Intrinsic::ptrauth_resign: {
-    Register DstReg = I.getOperand(0).getReg();
-    Register ValReg = I.getOperand(2).getReg();
-    uint64_t AUTKey = I.getOperand(3).getImm();
-    Register AUTDisc = I.getOperand(4).getReg();
-    uint64_t PACKey = I.getOperand(5).getImm();
-    Register PACDisc = I.getOperand(6).getReg();
-
-    Register AUTAddrDisc = AUTDisc;
-    uint16_t AUTConstDiscC = 0;
-    std::tie(AUTConstDiscC, AUTAddrDisc) =
-        extractPtrauthBlendDiscriminators(AUTDisc, MRI);
-
-    Register PACAddrDisc = PACDisc;
-    uint16_t PACConstDiscC = 0;
-    std::tie(PACConstDiscC, PACAddrDisc) =
-        extractPtrauthBlendDiscriminators(PACDisc, MRI);
-
-    MIB.buildCopy({AArch64::X16}, {ValReg});
-    MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
-    MIB.buildInstr(AArch64::AUTPAC)
-        .addImm(AUTKey)
-        .addImm(AUTConstDiscC)
-        .addUse(AUTAddrDisc)
-        .addImm(PACKey)
-        .addImm(PACConstDiscC)
-        .addUse(PACAddrDisc)
-        .constrainAllUses(TII, TRI, RBI);
-    MIB.buildCopy({DstReg}, Register(AArch64::X16));
-
-    RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
-    I.eraseFromParent();
-    return true;
-  }
-  case Intrinsic::ptrauth_auth: {
-    Register DstReg = I.getOperand(0).getReg();
-    Register ValReg = I.getOperand(2).getReg();
-    uint64_t AUTKey = I.getOperand(3).getImm();
-    Register AUTDisc = I.getOperand(4).getReg();
-
-    Register AUTAddrDisc = AUTDisc;
-    uint16_t AUTConstDiscC = 0;
-    std::tie(AUTConstDiscC, AUTAddrDisc) =
-        extractPtrauthBlendDiscriminators(AUTDisc, MRI);
-
-    MIB.buildCopy({AArch64::X16}, {ValReg});
-    MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
-    MIB.buildInstr(AArch64::AUT)
-        .addImm(AUTKey)
-        .addImm(AUTConstDiscC)
-        .addUse(AUTAddrDisc)
-        .constrainAllUses(TII, TRI, RBI);
-    MIB.buildCopy({DstReg}, Register(AArch64::X16));
-
-    RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
-    I.eraseFromParent();
-    return true;
-  }
   case Intrinsic::frameaddress:
   case Intrinsic::returnaddress: {
     MachineFunction &MF = *I.getParent()->getParent();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index c1c51660ecf1c..3f8641945bcd7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1288,8 +1288,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder(G_PREFETCH).custom();
 
-  getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
-
   getLegacyLegalizerInfo().computeTables();
   verify(*ST.getInstrInfo());
 }
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index ed670bce594ec..27751633017ca 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -181,7 +181,7 @@ class AArch64ELFStreamer : public MCELFStreamer {
                      std::unique_ptr<MCCodeEmitter> Emitter)
       : MCELFStreamer(Context, std::move(TAB), std::move(OW),
                       std::move(Emitter)),
-        LastEMS(EMS_None) {}
+        MappingSymbolCounter(0), LastEMS(EMS_None) {}
 
   void changeSection(MCSection *Section, uint32_t Subsection = 0) override {
     // We have to keep track of the mapping symbol state of any sections we
@@ -195,6 +195,7 @@ class AArch64ELFStreamer : public MCELFStreamer {
 
   // Reset state between object emissions
   void reset() override {
+    MappingSymbolCounter = 0;
     MCELFStreamer::reset();
     LastMappingSymbols.clear();
     LastEMS = EMS_None;
@@ -270,12 +271,15 @@ class AArch64ELFStreamer : public MCELFStreamer {
   }
 
   void emitMappingSymbol(StringRef Name) {
-    auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+        Name + "." + Twine(MappingSymbolCounter++)));
     emitLabel(Symbol);
     Symbol->setType(ELF::STT_NOTYPE);
     Symbol->setBinding(ELF::STB_LOCAL);
   }
 
+  int64_t MappingSymbolCounter;
+
   DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
   ElfMappingSymbol LastEMS;
 };
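
The restored counter-based mapping-symbol naming boils down to this small standalone model (not LLVM code): every "$x"/"$d" symbol gets a unique ".N" suffix from a per-streamer counter, which reset() zeroes between object emissions.

#include <cstdint>
#include <iostream>
#include <string>

struct MappingSymbolNamer {
  int64_t Counter = 0;
  std::string next(const std::string &Base) {
    return Base + "." + std::to_string(Counter++);
  }
  void reset() { Counter = 0; }
};

int main() {
  MappingSymbolNamer N;
  std::cout << N.next("$x") << "\n"; // $x.0
  std::cout << N.next("$d") << "\n"; // $d.1
  N.reset();
  std::cout << N.next("$x") << "\n"; // $x.0 again, for the next object
}
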
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7906e0ee9d785..dfc8eaea66f7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -947,12 +947,6 @@ def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
   "Has restricted SOffset (immediate not supported)."
 >;
 
-def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority",
-  "HasRequiredExportPriority",
-  "true",
-  "Export priority must be explicitly manipulated on GFX11.5"
->;
-
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -1573,8 +1567,7 @@ def FeatureISAVersion11_Generic: FeatureSet<
      FeatureUserSGPRInit16Bug,
      FeatureMADIntraFwdBug,
      FeaturePrivEnabledTrap2NopBug,
-     FeatureRequiresCOV6,
-     FeatureRequiredExportPriority])>;
+     FeatureRequiresCOV6])>;
 
 def FeatureISAVersion11_0_Common : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
@@ -1604,23 +1597,20 @@ def FeatureISAVersion11_5_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureSALUFloatInsts,
      FeatureDPPSrc1SGPR,
-     FeatureVGPRSingleUseHintInsts,
-     FeatureRequiredExportPriority])>;
+     FeatureVGPRSingleUseHintInsts])>;
 
 def FeatureISAVersion11_5_1 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureSALUFloatInsts,
      FeatureDPPSrc1SGPR,
      FeatureVGPRSingleUseHintInsts,
-     Feature1_5xVGPRs,
-     FeatureRequiredExportPriority])>;
+     Feature1_5xVGPRs])>;
 
 def FeatureISAVersion11_5_2 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureSALUFloatInsts,
      FeatureDPPSrc1SGPR,
-     FeatureVGPRSingleUseHintInsts,
-     FeatureRequiredExportPriority])>;
+     FeatureVGPRSingleUseHintInsts])>;
 
 def FeatureISAVersion12 : FeatureSet<
   [FeatureGFX12,
@@ -2026,8 +2016,6 @@ def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
 
 def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
 
-def HasXNACKEnabled : Predicate<"Subtarget->isXNACKEnabled()">;
-
 def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
   AssemblerPredicate<(all_of Feature16BitInsts)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index e64e28e01d3d1..3154dc6fe433d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1487,10 +1487,6 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
   if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
     return;
 
-  // Currently non-kernel functions have no resources to emit.
-  if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
-    return;
-
   auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                      StringRef RemarkLabel, auto Argument) {
     // Add an indent for every line besides the line with the kernel name. This
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 7bf5170794cd9..fe7731b9550bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -360,7 +360,7 @@ bool LiveRegOptimizer::optimizeLiveType(
         Type *NewType = calculateConvertType(Phi->getType());
         NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                             Phi->getIncomingBlock(I));
-      } else if (ValMap.contains(IncVal) && ValMap[IncVal])
+      } else if (ValMap.contains(IncVal))
         NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
       else
         MissingIncVal = true;
@@ -370,11 +370,11 @@ bool LiveRegOptimizer::optimizeLiveType(
       // The coercion chain of the PHI is broken. Delete the Phi
       // from the ValMap and any connected / user Phis.
       SmallVector<Value *, 4> PHIWorklist;
-      SmallPtrSet<Value *, 4> VisitedPhis;
+      SmallPtrSet<Value *, 4> Visited;
       PHIWorklist.push_back(DeadVal);
       while (!PHIWorklist.empty()) {
         Value *NextDeadValue = PHIWorklist.pop_back_val();
-        VisitedPhis.insert(NextDeadValue);
+        Visited.insert(NextDeadValue);
         auto OriginalPhi =
             std::find_if(PhiNodes.begin(), PhiNodes.end(),
                          [this, &NextDeadValue](PHINode *CandPhi) {
@@ -388,7 +388,7 @@ bool LiveRegOptimizer::optimizeLiveType(
         DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));
 
         for (User *U : NextDeadValue->users()) {
-          if (!VisitedPhis.contains(cast<PHINode>(U)))
+          if (!Visited.contains(cast<PHINode>(U)))
             PHIWorklist.push_back(U);
         }
       }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 74e93b0620d26..89ef0f299feab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7366,8 +7366,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return legalizeBufferStore(MI, MRI, B, true, true);
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
-  case Intrinsic::amdgcn_raw_atomic_buffer_load:
-  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load:
   case Intrinsic::amdgcn_struct_ptr_buffer_load:
     return legalizeBufferLoad(MI, MRI, B, false, false);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 77971323aa1ec..6be9be21a8a86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1092,9 +1092,8 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
 
   Intrinsic::ID IID = Intrinsic::not_intrinsic;
   if (isa<LoadInst>(I))
-    IID = Order == AtomicOrdering::NotAtomic
-              ? Intrinsic::amdgcn_raw_ptr_buffer_load
-              : Intrinsic::amdgcn_raw_ptr_atomic_buffer_load;
+    // TODO: Do we need to do something about atomic loads?
+    IID = Intrinsic::amdgcn_raw_ptr_buffer_load;
   else if (isa<StoreInst>(I))
     IID = Intrinsic::amdgcn_raw_ptr_buffer_store;
   else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index aa329a58547f3..68f4767458703 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4985,8 +4985,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     }
     case Intrinsic::amdgcn_raw_buffer_load:
     case Intrinsic::amdgcn_raw_ptr_buffer_load:
-    case Intrinsic::amdgcn_raw_atomic_buffer_load:
-    case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
     case Intrinsic::amdgcn_raw_tbuffer_load:
     case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
       // FIXME: Should make intrinsic ID the last operand of the instruction,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c8fb68d1c0b0c..192996f84c5f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -85,19 +85,16 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
 };
 
 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
-                              const MachineRegisterInfo &MRI,
-                              const Register Reg) {
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
+                              const TargetRegisterClass &RC) {
+  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
 }
 
 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
-                              const MachineRegisterInfo &MRI,
-                              const Register Reg) {
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
+                              const TargetRegisterClass &RC) {
+  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
 }
 
+
 /// -{sgpr|vgpr}-regalloc=... command line option.
 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
 
@@ -752,7 +749,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
       });
 
   PB.registerRegClassFilterParsingCallback(
-      [](StringRef FilterName) -> RegAllocFilterFunc {
+      [](StringRef FilterName) -> RegClassFilterFunc {
         if (FilterName == "sgpr")
           return onlyAllocateSGPRs;
         if (FilterName == "vgpr")
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a8b171aa82840..a402fc6d7e611 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,7 +14,6 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/TargetParser/TargetParser.h"
@@ -1105,7 +1104,6 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixWMMAHazards(MI);
   fixShift64HighRegBug(MI);
   fixVALUMaskWriteHazard(MI);
-  fixRequiredExportPriority(MI);
 }
 
 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -2897,113 +2895,3 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
 
   return true;
 }
-
-static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
-                               const SIInstrInfo &TII) {
-  MachineBasicBlock &EntryMBB = MF->front();
-  if (EntryMBB.begin() != EntryMBB.end()) {
-    auto &EntryMI = *EntryMBB.begin();
-    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
-        EntryMI.getOperand(0).getImm() >= Priority)
-      return false;
-  }
-
-  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
-      .addImm(Priority);
-  return true;
-}
-
-bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
-  if (!ST.hasRequiredExportPriority())
-    return false;
-
-  // Assume the following shader types will never have exports,
-  // and avoid adding or adjusting S_SETPRIO.
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineFunction *MF = MBB->getParent();
-  auto CC = MF->getFunction().getCallingConv();
-  switch (CC) {
-  case CallingConv::AMDGPU_CS:
-  case CallingConv::AMDGPU_CS_Chain:
-  case CallingConv::AMDGPU_CS_ChainPreserve:
-  case CallingConv::AMDGPU_KERNEL:
-    return false;
-  default:
-    break;
-  }
-
-  const int MaxPriority = 3;
-  const int NormalPriority = 2;
-  const int PostExportPriority = 0;
-
-  auto It = MI->getIterator();
-  switch (MI->getOpcode()) {
-  case AMDGPU::S_ENDPGM:
-  case AMDGPU::S_ENDPGM_SAVED:
-  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
-  case AMDGPU::SI_RETURN_TO_EPILOG:
-    // Ensure shader with calls raises priority at entry.
-    // This ensures correct priority if exports exist in callee.
-    if (MF->getFrameInfo().hasCalls())
-      return ensureEntrySetPrio(MF, NormalPriority, TII);
-    return false;
-  case AMDGPU::S_SETPRIO: {
-    // Raise minimum priority unless in workaround.
-    auto &PrioOp = MI->getOperand(0);
-    int Prio = PrioOp.getImm();
-    bool InWA = (Prio == PostExportPriority) &&
-                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
-    if (InWA || Prio >= NormalPriority)
-      return false;
-    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
-    return true;
-  }
-  default:
-    if (!TII.isEXP(*MI))
-      return false;
-    break;
-  }
-
-  // Check entry priority at each export (as there will only be a few).
-  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
-  bool Changed = false;
-  if (CC != CallingConv::AMDGPU_Gfx)
-    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
-
-  auto NextMI = std::next(It);
-  bool EndOfShader = false;
-  if (NextMI != MBB->end()) {
-    // Only need WA at end of sequence of exports.
-    if (TII.isEXP(*NextMI))
-      return Changed;
-    // Assume appropriate S_SETPRIO after export means WA already applied.
-    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
-        NextMI->getOperand(0).getImm() == PostExportPriority)
-      return Changed;
-    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
-  }
-
-  const DebugLoc &DL = MI->getDebugLoc();
-
-  // Lower priority.
-  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
-      .addImm(PostExportPriority);
-
-  if (!EndOfShader) {
-    // Wait for exports to complete.
-    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
-        .addReg(AMDGPU::SGPR_NULL)
-        .addImm(0);
-  }
-
-  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
-  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
-
-  if (!EndOfShader) {
-    // Return to normal (higher) priority.
-    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
-        .addImm(NormalPriority);
-  }
-
-  return true;
-}
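
For context, the removed hazard fix bracketed the last export of a sequence roughly as follows (a hand-written sketch reconstructed from the BuildMI calls above, not verbatim compiler output; NormalPriority is 2 and PostExportPriority is 0):

    s_setprio 0               ; drop to PostExportPriority after the export
    s_waitcnt_expcnt null, 0  ; wait for exports (skipped at end of shader)
    s_nop 0
    s_nop 0
    s_setprio 2               ; restore NormalPriority (skipped at end of shader)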
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index f2a64ab48e180..3ccca527c626b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -107,7 +107,6 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixWMMAHazards(MachineInstr *MI);
   bool fixShift64HighRegBug(MachineInstr *MI);
   bool fixVALUMaskWriteHazard(MachineInstr *MI);
-  bool fixRequiredExportPriority(MachineInstr *MI);
 
   int checkMAIHazards(MachineInstr *MI);
   int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index def89c785b855..e5817594a4521 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -238,7 +238,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasVOPDInsts = false;
   bool HasVALUTransUseHazard = false;
   bool HasForceStoreSC0SC1 = false;
-  bool HasRequiredExportPriority = false;
 
   bool RequiresCOV6 = false;
 
@@ -1283,8 +1282,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
 
-  bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
-
   /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
   /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
   bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index c2d2ca0f90f93..68774c12516ca 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -10,7 +10,6 @@
 #include "MCTargetDesc/AMDGPUFixupKinds.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index db610a4badda3..edc645103135b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -605,9 +605,9 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
 // We use it for emitting the accumulated PAL metadata as a .note record.
 // The PAL metadata is reset after it is emitted.
 void AMDGPUTargetELFStreamer::finish() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  W.setELFHeaderEFlags(getEFlags());
-  W.setOverrideABIVersion(
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCA.setELFHeaderEFlags(getEFlags());
+  MCA.getWriter().setOverrideABIVersion(
       getELFABIVersion(STI.getTargetTriple(), CodeObjectVersion));
 
   std::string Blob;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7f95442401dbc..a09e0ad2c0c29 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1277,14 +1277,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
         Info.ptrVal = CI.getArgOperand(1);
         return true;
       }
-      case Intrinsic::amdgcn_raw_atomic_buffer_load:
-      case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: {
-        Info.memVT =
-            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
-                                    std::numeric_limits<unsigned>::max());
-        Info.flags &= ~MachineMemOperand::MOStore;
-        return true;
-      }
       }
     }
     return true;
@@ -8897,8 +8889,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
-  case Intrinsic::amdgcn_raw_atomic_buffer_load:
-  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
     const bool IsFormat =
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae537b194f50c..8b42d4a1dee7a 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -216,7 +216,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                    CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                         const CombineInfo &Paired);
-  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
   const TargetRegisterClass *
@@ -353,7 +353,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
-  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -364,7 +363,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
-  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -375,7 +373,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
-  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -386,7 +383,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
-  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return 8;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -511,10 +507,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
-  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return S_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -599,10 +591,6 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
-  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return AMDGPU::S_LOAD_DWORD_IMM;
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -715,10 +703,6 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
-  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
-  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     Result.SBase = true;
     return Result;
   case AMDGPU::DS_READ_B32:
@@ -1228,14 +1212,8 @@ void SILoadStoreOptimizer::copyToDestRegs(
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
-  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
-
-  // The constrained sload instructions in S_LOAD_IMM class will have
-  // `early-clobber` flag in the dst operand. Remove the flag before using the
-  // MOs in copies.
-  Dest0->setIsEarlyClobber(false);
-  Dest1->setIsEarlyClobber(false);
+  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
 
   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1722,29 +1700,19 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
     case 8:
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
-  case S_LOAD_IMM: {
-    // If XNACK is enabled, use the constrained opcodes when the first load is
-    // under-aligned.
-    const MachineMemOperand *MMO = *CI.I->memoperands_begin();
-    bool NeedsConstrainedOpc =
-        STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+  case S_LOAD_IMM:
     switch (Width) {
     default:
       return 0;
     case 2:
-      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
-                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
+      return AMDGPU::S_LOAD_DWORDX2_IMM;
     case 3:
-      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
-                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
+      return AMDGPU::S_LOAD_DWORDX3_IMM;
     case 4:
-      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
-                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
+      return AMDGPU::S_LOAD_DWORDX4_IMM;
     case 8:
-      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
-                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
+      return AMDGPU::S_LOAD_DWORDX8_IMM;
     }
-  }
   case GLOBAL_LOAD:
     switch (Width) {
     default:
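
The deleted S_LOAD_IMM branch selected the early-clobber (_ec) opcodes only when XNACK is enabled and the merged load is under-aligned. A worked instance of that check, with hypothetical values:

    // Pairing two dword loads into S_LOAD_DWORDX2_IMM: Width == 2,
    // so the merged access spans Width * 4 == 8 bytes.
    bool XNACKEnabled = true;   // STM->isXNACKEnabled()
    unsigned Width = 2;
    uint64_t AlignBytes = 4;    // alignment of the first load's MMO
    bool NeedsConstrainedOpc =
        XNACKEnabled && AlignBytes < Width * 4; // 4 < 8, so _ec was chosen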
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index bd4203ccd6fe4..452dac4b00993 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -217,7 +217,7 @@ class SIMemOpInfo final {
 
 class SIMemOpAccess final {
 private:
-  const AMDGPUMachineModuleInfo *MMI = nullptr;
+  AMDGPUMachineModuleInfo *MMI = nullptr;
 
   /// Reports unsupported message \p Msg for \p MI to LLVM context.
   void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -241,7 +241,7 @@ class SIMemOpAccess final {
 public:
   /// Construct class to support accessing the machine memory operands
   /// of instructions in the machine function \p MF.
-  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
+  SIMemOpAccess(MachineFunction &MF);
 
   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
   std::optional<SIMemOpInfo>
@@ -806,8 +806,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
   return SIAtomicAddrSpace::OTHER;
 }
 
-SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
-    : MMI(&MMI_) {}
+SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
+  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+}
 
 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     const MachineBasicBlock::iterator &MI) const {
@@ -2801,10 +2802,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
-  const MachineModuleInfo &MMI =
-      getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
-
-  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
+  SIMemOpAccess MOA(MF);
   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
 
   for (auto &MBB : MF) {
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 9e470e27272c3..df1722b1f7fb4 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -161,25 +161,12 @@ class SM_Discard_Pseudo <string opName, OffsetMode offsets>
   let has_soffset = offsets.HasSOffset;
 }
 
-multiclass SM_Load_Pseudos<string op, RegisterClass baseClass,
-                           RegisterClass dstClass, OffsetMode offsets> {
-  defvar opName = !tolower(op);
-  def "" : SM_Load_Pseudo <opName, baseClass, dstClass, offsets>;
-
-  // The constrained multi-dword load equivalents with early clobber flag at
-  // the dst operands. They are needed only for codegen and there is no need
-  // for their real opcodes.
-  if !gt(dstClass.RegTypes[0].Size, 32) then
-    let Constraints = "@earlyclobber $sdst",
-        PseudoInstr = op # offsets.Variant in
-      def "" # _ec : SM_Load_Pseudo <opName, baseClass, dstClass, offsets>;
-}
-
 multiclass SM_Pseudo_Loads<RegisterClass baseClass,
                            RegisterClass dstClass> {
-  defm _IMM : SM_Load_Pseudos <NAME, baseClass, dstClass, IMM_Offset>;
-  defm _SGPR : SM_Load_Pseudos <NAME, baseClass, dstClass, SGPR_Offset>;
-  defm _SGPR_IMM : SM_Load_Pseudos <NAME, baseClass, dstClass, SGPR_IMM_Offset>;
+  defvar opName = !tolower(NAME);
+  def _IMM : SM_Load_Pseudo <opName, baseClass, dstClass, IMM_Offset>;
+  def _SGPR : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_Offset>;
+  def _SGPR_IMM : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_IMM_Offset>;
 }
 
 multiclass SM_Pseudo_Stores<RegisterClass baseClass,
@@ -866,74 +853,45 @@ def SMRDBufferImm   : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
 
-class SMRDAlignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
-  // Returns true if it is a single dword load or naturally aligned multi-dword load.
-  LoadSDNode *Ld = cast<LoadSDNode>(N);
-  unsigned Size = Ld->getMemoryVT().getStoreSize();
-  return Size <= 4 || Ld->getAlign().value() >= Size;
-}]> {
-  let GISelPredicateCode = [{
-  auto &Ld = cast<GLoad>(MI);
-  TypeSize Size = Ld.getMMO().getSize().getValue();
-  return Size <= 4 || Ld.getMMO().getAlign().value() >= Size;
-  }];
-}
-
-def aligned_smrd_load : SMRDAlignedLoadPat<smrd_load>;
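
The predicate this PatFrag carried is simple enough to restate in plain C++; a sketch of the same rule (hypothetical helper, not an LLVM API):

    // "Naturally aligned": a single dword always qualifies; wider SMRD
    // loads need alignment at least equal to their store size.
    bool isNaturallyAlignedSMRD(uint64_t SizeBytes, uint64_t AlignBytes) {
      return SizeBytes <= 4 || AlignBytes >= SizeBytes;
    }
    // e.g. an 8-byte DWORDX2 load at 4-byte alignment fails (4 < 8),
    // which is what steered selection to the constrained _ec opcodes.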
+multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
 
-multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag,
-                          bit immci = true, string suffix = ""> {
   // 1. IMM offset
   def : GCNPat <
-    (frag (SMRDImm i64:$sbase, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_IMM"#suffix) $sbase, $offset, 0))>;
+    (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
+  >;
 
   // 2. 32-bit IMM offset on CI
   if immci then def : GCNPat <
-    (frag (SMRDImm32 i64:$sbase, i32:$offset)),
-    (vt (!cast<InstSI>(Instr#"_IMM_ci"#suffix) $sbase, $offset, 0))> {
-    let SubtargetPredicate = isGFX7Only;
+    (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
+    (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
+    let OtherPredicates = [isGFX7Only];
   }
 
   // 3. SGPR offset
   def : GCNPat <
-    (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR"#suffix) $sbase, $soffset, 0))> {
-    let SubtargetPredicate = isNotGFX9Plus;
+    (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
+    let OtherPredicates = [isNotGFX9Plus];
   }
   def : GCNPat <
-    (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
-    let SubtargetPredicate = isGFX9Plus;
+    (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
+    let OtherPredicates = [isGFX9Plus];
   }
 
   // 4. SGPR+IMM offset
   def : GCNPat <
-    (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
-    let SubtargetPredicate = isGFX9Plus;
+    (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
+    let OtherPredicates = [isGFX9Plus];
   }
 
   // 5. No offset
   def : GCNPat <
-    (vt (frag (i64 SReg_64:$sbase))),
-    (vt (!cast<SM_Pseudo>(Instr#"_IMM"#suffix) i64:$sbase, 0, 0))>;
-}
-
-multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
-  // High priority when XNACK is enabled and the load was naturally aligned.
-  let OtherPredicates = [HasXNACKEnabled], AddedComplexity = 102 in
-    defm: SMRD_Patterns <Instr, vt, aligned_smrd_load, immci>;
-
-  // XNACK is enabled and the load wasn't naturally aligned. The constrained sload variant.
-  if !gt(vt.Size, 32) then {
-    let OtherPredicates = [HasXNACKEnabled], AddedComplexity = 101 in
-      defm: SMRD_Patterns <Instr, vt, smrd_load, /*immci=*/false, /*suffix=*/"_ec">;
-  }
-
-  // XNACK is disabled.
-  let AddedComplexity = 100 in
-    defm: SMRD_Patterns <Instr, vt, smrd_load, immci>;
+    (vt (smrd_load (i64 SReg_64:$sbase))),
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
+  >;
 }
 
 multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
@@ -1047,8 +1005,6 @@ defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ubyte, "S_BUFFER_LOAD_U8">;
 defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_short, "S_BUFFER_LOAD_I16">;
 defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ushort, "S_BUFFER_LOAD_U16">;
 
-} // End let AddedComplexity = 100
-
 foreach vt = Reg32Types.types in {
 defm : SMRD_Pattern <"S_LOAD_DWORD", vt>;
 }
@@ -1073,6 +1029,7 @@ foreach vt = SReg_512.RegTypes in {
 defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>;
 }
 
+} // End let AddedComplexity = 100
 
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD",     i32>;
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2",   v2i32>;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 3182fecffecf4..649e6dc039830 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -1128,7 +1128,7 @@ void ARMELFStreamer::reset() {
   // MCELFStreamer clear's the assembler's e_flags. However, for
   // arm we manually set the ABI version on streamer creation, so
   // do the same here
-  getWriter().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+  getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
 }
 
 inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
@@ -1484,7 +1484,7 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context,
   // FIXME: This should eventually end up somewhere else where more
   // intelligent flag decisions can be made. For now we are just maintaining
   // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
-  S->getWriter().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+  S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
 
   return S;
 }
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 0d29912bee264..d520880d73bbd 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -13,7 +13,6 @@
 #include "MCTargetDesc/AVRAsmBackend.h"
 #include "MCTargetDesc/AVRFixupKinds.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
index b49fa38e7b146..4ac54c8876d72 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -56,13 +56,14 @@ static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
 
 AVRELFStreamer::AVRELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI)
     : AVRTargetStreamer(S) {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned EFlags = W.getELFHeaderEFlags();
+
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
 
   EFlags |= getEFlagsForFeatureSet(STI.getFeatureBits());
   EFlags |= ELF::EF_AVR_LINKRELAX_PREPARED;
 
-  W.setELFHeaderEFlags(EFlags);
+  MCA.setELFHeaderEFlags(EFlags);
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
index a77e3be5eba66..059c3c143c311 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
@@ -31,10 +31,10 @@ using namespace llvm;
 CSKYTargetELFStreamer::CSKYTargetELFStreamer(MCStreamer &S,
                                              const MCSubtargetInfo &STI)
     : CSKYTargetStreamer(S), CurrentVendor("csky") {
-  ELFObjectWriter &W = getStreamer().getWriter();
+  MCAssembler &MCA = getStreamer().getAssembler();
   const FeatureBitset &Features = STI.getFeatureBits();
 
-  unsigned EFlags = W.getELFHeaderEFlags();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
 
   EFlags |= ELF::EF_CSKY_ABIV2;
 
@@ -62,7 +62,7 @@ CSKYTargetELFStreamer::CSKYTargetELFStreamer(MCStreamer &S,
 
   EFlags |= ELF::EF_CSKY_EFV1;
 
-  W.setELFHeaderEFlags(EFlags);
+  MCA.setELFHeaderEFlags(EFlags);
 }
 
 MCELFStreamer &CSKYTargetELFStreamer::getStreamer() {
@@ -168,7 +168,8 @@ void CSKYELFStreamer::EmitMappingSymbol(StringRef Name) {
 
   State = (Name == "$t" ? EMS_Text : EMS_Data);
 
-  auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+  auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+      Name + "." + Twine(MappingSymbolCounter++)));
   emitLabel(Symbol);
 
   Symbol->setType(ELF::STT_NOTYPE);
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
index 1392fc9b5cca9..b7931e9222792 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.h
@@ -99,6 +99,8 @@ class CSKYTargetELFStreamer : public CSKYTargetStreamer {
 };
 
 class CSKYELFStreamer : public MCELFStreamer {
+  int64_t MappingSymbolCounter = 0;
+
   void EmitMappingSymbol(StringRef Name);
 
 public:
@@ -136,6 +138,7 @@ class CSKYELFStreamer : public MCELFStreamer {
     MCELFStreamer::emitValueImpl(Value, Size, Loc);
   }
   void reset() override {
+    MappingSymbolCounter = 0;
     State = EMS_None;
     MCELFStreamer::reset();
   }
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 4ef009c87a1e6..b75cd1beadc58 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -829,7 +829,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
     return false;
 
   Value *X = nullptr;
-  if (!match(C, m_And(m_Value(X), m_One())))
+  if (!match(C, m_c_And(m_Value(X), m_One())))
     return false;
   // Matched: select (X & 1) == +++ ? ... : ...
   //          select (X & 1) != +++ ? ... : ...
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 1570493b765ca..6acc37e599f2e 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -702,7 +702,7 @@ class HexagonAsmBackend : public MCAsmBackend {
     return true;
   }
 
-  bool finishLayout(const MCAssembler &Asm) const override {
+  void finishLayout(MCAssembler const &Asm) const override {
     SmallVector<MCFragment *> Frags;
     for (MCSection &Sec : Asm) {
       Frags.clear();
@@ -747,6 +747,7 @@ class HexagonAsmBackend : public MCAsmBackend {
               //assert(!Error);
               (void)Error;
               ReplaceInstruction(Asm.getEmitter(), RF, Inst);
+              Sec.setHasLayout(false);
               Size = 0; // Only look back one instruction
               break;
             }
@@ -756,7 +757,6 @@ class HexagonAsmBackend : public MCAsmBackend {
         }
       }
     }
-    return true;
   }
 }; // class HexagonAsmBackend
 
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 7b21d30e6cef1..c16edc9857bfc 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -291,7 +291,8 @@ class HexagonTargetELFStreamer : public HexagonTargetStreamer {
   }
   HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI)
       : HexagonTargetStreamer(S) {
-    getStreamer().getWriter().setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
+    MCAssembler &MCA = getStreamer().getAssembler();
+    MCA.setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
   }
 
   void emitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index ddb27dc6404fa..8a628157c6018 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -106,6 +106,11 @@ def FeatureRelax
     : SubtargetFeature<"relax", "HasLinkerRelax", "true",
                        "Enable Linker relaxation">;
 
+// Experimental auto vectorization
+def FeatureAutoVec
+    : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
+                       "Experimental auto vectorization">;
+
 // Floating point approximation operation
 def FeatureFrecipe
     : SubtargetFeature<"frecipe", "HasFrecipe", "true",
@@ -146,13 +151,6 @@ def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit,
                                              FeatureExtLVZ,
                                              FeatureExtLBT]>;
 
-def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit,
-                                             FeatureUAL,
-                                             FeatureExtLASX,
-                                             FeatureExtLVZ,
-                                             FeatureExtLBT,
-                                             FeatureFrecipe]>;
-
 //===----------------------------------------------------------------------===//
 // Define the LoongArch target.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index d80509cf39849..ba6be85c7f2e8 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -252,9 +252,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
       setOperationAction(ISD::SETCC, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
-      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
       setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT,
                          Legal);
@@ -298,9 +298,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
       setOperationAction(ISD::SETCC, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
-      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) {
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
       setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT,
                          Legal);
@@ -335,7 +335,6 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::SRL);
-  setTargetDAGCombine(ISD::SETCC);
 
   // Set DAG combine for 'LSX' feature.
 
@@ -428,926 +427,9 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
   return SDValue();
 }
 
-/// Determine whether a range fits a regular pattern of values.
-/// This function accounts for the possibility of jumping over the End iterator.
-template <typename ValType>
-static bool
-fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
-                   unsigned CheckStride,
-                   typename SmallVectorImpl<ValType>::const_iterator End,
-                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
-  auto &I = Begin;
-
-  while (I != End) {
-    if (*I != -1 && *I != ExpectedIndex)
-      return false;
-    ExpectedIndex += ExpectedIndexStride;
-
-    // Incrementing past End is undefined behaviour so we must increment one
-    // step at a time and check for End at each step.
-    for (unsigned n = 0; n < CheckStride && I != End; ++n, ++I)
-      ; // Empty loop body.
-  }
-  return true;
-}
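
Two hypothetical calls showing how the deleted helper classifies a mask (values chosen for illustration only):

    SmallVector<int, 4> Mask = {0, 2, 4, 6};
    // Check every element (CheckStride 1) against 0, 2, 4, 6:
    bool A = fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), 0, 2); // true
    // Check every second element (CheckStride 2) against 0 then 4:
    bool B = fitsRegularPattern<int>(Mask.begin(), 2, Mask.end(), 0, 4); // true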
-
-/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
-///
-/// VREPLVEI performs vector broadcast based on an element specified by an
-/// integer immediate, with its mask being similar to:
-///   <x, x, x, ...>
-/// where x is any valid index.
-///
-/// When undef's appear in the mask they are treated as if they were whatever
-/// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
-                                            MVT VT, SDValue V1, SDValue V2,
-                                            SelectionDAG &DAG) {
-  int SplatIndex = -1;
-  for (const auto &M : Mask) {
-    if (M != -1) {
-      SplatIndex = M;
-      break;
-    }
-  }
-
-  if (SplatIndex == -1)
-    return DAG.getUNDEF(VT);
-
-  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
-  if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) {
-    APInt Imm(64, SplatIndex);
-    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
-                       DAG.getConstant(Imm, DL, MVT::i64));
-  }
-
-  return SDValue();
-}
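
A concrete case, assuming v4i32:

    // Mask <1, -1, 1, 1>: SplatIndex = 1 and the undef lane is a
    // wildcard, so fitsRegularPattern(..., /*ExpectedIndex=*/1,
    // /*Stride=*/0) holds and the shuffle becomes
    //   (VREPLVEI $v1, 1)   // broadcast element 1 of V1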
-
-/// Lower VECTOR_SHUFFLE into VSHUF4I (if possible).
-///
-/// VSHUF4I splits the vector into blocks of four elements, then shuffles these
-/// elements according to a <4 x i2> constant (encoded as an integer immediate).
-///
-/// It is therefore possible to lower into VSHUF4I when the mask takes the form:
-///   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
-/// When undef's appear they are treated as if they were whatever value is
-/// necessary in order to fit the above forms.
-///
-/// For example:
-///   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
-///                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
-///                                 i32 7, i32 6, i32 5, i32 4>
-/// is lowered to:
-///   (VSHUF4I_H $v0, $v1, 27)
-/// where the 27 comes from:
-///   3 + (2 << 2) + (1 << 4) + (0 << 6)
-static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
-                                           MVT VT, SDValue V1, SDValue V2,
-                                           SelectionDAG &DAG) {
-
-  // When the size is less than 4, lower cost instructions may be used.
-  if (Mask.size() < 4)
-    return SDValue();
-
-  int SubMask[4] = {-1, -1, -1, -1};
-  for (unsigned i = 0; i < 4; ++i) {
-    for (unsigned j = i; j < Mask.size(); j += 4) {
-      int Idx = Mask[j];
-
-      // Convert from vector index to 4-element subvector index
-      // If an index refers to an element outside of the subvector then give up
-      if (Idx != -1) {
-        Idx -= 4 * (j / 4);
-        if (Idx < 0 || Idx >= 4)
-          return SDValue();
-      }
-
-      // If the mask has an undef, replace it with the current index.
-      // Note that it might still be undef if the current index is also undef
-      if (SubMask[i] == -1)
-        SubMask[i] = Idx;
-      // Check that non-undef values are the same as in the mask. If they
-      // aren't then give up
-      else if (Idx != -1 && Idx != SubMask[i])
-        return SDValue();
-    }
-  }
-
-  // Calculate the immediate. Replace any remaining undefs with zero
-  APInt Imm(64, 0);
-  for (int i = 3; i >= 0; --i) {
-    int Idx = SubMask[i];
-
-    if (Idx == -1)
-      Idx = 0;
-
-    Imm <<= 2;
-    Imm |= Idx & 0x3;
-  }
-
-  return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1,
-                     DAG.getConstant(Imm, DL, MVT::i64));
-}
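
Recomputing the documented example by hand (the shifts mirror the immediate-building loop above):

    // Mask <3,2,1,0, 7,6,5,4> reduces to SubMask {3,2,1,0} in every
    // 4-lane block; the immediate packs two bits per lane:
    unsigned Imm = 3 | (2 << 2) | (1 << 4) | (0 << 6); // == 27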
-
-/// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
-///
-/// VPACKEV interleaves the even elements from each vector.
-///
-/// It is possible to lower into VPACKEV when the mask consists of two of the
-/// following forms interleaved:
-///   <0, 2, 4, ...>
-///   <n, n+2, n+4, ...>
-/// where n is the number of elements in the vector.
-/// For example:
-///   <0, 0, 2, 2, 4, 4, ...>
-///   <0, n, 2, n+2, 4, n+4, ...>
-///
-/// When undef's appear in the mask they are treated as if they were whatever
-/// value is necessary in order to fit the above forms.
-static SDValue lowerVECTOR_SHUFFLE_VPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
-                                           MVT VT, SDValue V1, SDValue V2,
-                                           SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &End = Mask.end();
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 2, End, 0, 2))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 2))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 2))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 2))
-    V2 = OriV2;
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VPACKEV, DL, VT, V2, V1);
-}
-
-/// Lower VECTOR_SHUFFLE into VPACKOD (if possible).
-///
-/// VPACKOD interleaves the odd elements from each vector.
-///
-/// It is possible to lower into VPACKOD when the mask consists of two of the
-/// following forms interleaved:
-///   <1, 3, 5, ...>
-///   <n+1, n+3, n+5, ...>
-/// where n is the number of elements in the vector.
-/// For example:
-///   <1, 1, 3, 3, 5, 5, ...>
-///   <1, n+1, 3, n+3, 5, n+5, ...>
-///
-/// When undef's appear in the mask they are treated as if they were whatever
-/// value is necessary in order to fit the above forms.
-static SDValue lowerVECTOR_SHUFFLE_VPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
-                                           MVT VT, SDValue V1, SDValue V2,
-                                           SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &End = Mask.end();
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 2, End, 1, 2))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + 1, 2))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(Begin + 1, 2, End, 1, 2))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + 1, 2))
-    V2 = OriV2;
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VPACKOD, DL, VT, V2, V1);
-}
-
-/// Lower VECTOR_SHUFFLE into VILVH (if possible).
-///
-/// VILVH interleaves consecutive elements from the left (highest-indexed) half
-/// of each vector.
-///
-/// It is possible to lower into VILVH when the mask consists of two of the
-/// following forms interleaved:
-///   <x, x+1, x+2, ...>
-///   <n+x, n+x+1, n+x+2, ...>
-/// where n is the number of elements in the vector and x is half n.
-/// For example:
-///   <x, x, x+1, x+1, x+2, x+2, ...>
-///   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
-///
-/// When undef's appear in the mask they are treated as if they were whatever
-/// value is necessary in order to fit the above forms.
-static SDValue lowerVECTOR_SHUFFLE_VILVH(const SDLoc &DL, ArrayRef<int> Mask,
-                                         MVT VT, SDValue V1, SDValue V2,
-                                         SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &End = Mask.end();
-  unsigned HalfSize = Mask.size() / 2;
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 2, End, HalfSize, 1))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + HalfSize, 1))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(Begin + 1, 2, End, HalfSize, 1))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + HalfSize,
-                                   1))
-    V2 = OriV2;
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1);
-}
-
-/// Lower VECTOR_SHUFFLE into VILVL (if possible).
-///
-/// VILVL interleaves consecutive elements from the right (lowest-indexed) half
-/// of each vector.
-///
-/// It is possible to lower into VILVL when the mask consists of two of the
-/// following forms interleaved:
-///   <0, 1, 2, ...>
-///   <n, n+1, n+2, ...>
-/// where n is the number of elements in the vector.
-/// For example:
-///   <0, 0, 1, 1, 2, 2, ...>
-///   <0, n, 1, n+1, 2, n+2, ...>
-///
-/// When undef's appear in the mask they are treated as if they were whatever
-/// value is necessary in order to fit the above forms.
-static SDValue lowerVECTOR_SHUFFLE_VILVL(const SDLoc &DL, ArrayRef<int> Mask,
-                                         MVT VT, SDValue V1, SDValue V2,
-                                         SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &End = Mask.end();
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 2, End, 0, 1))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 1))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 1))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 1))
-    V2 = OriV2;
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1);
-}
-
-/// Lower VECTOR_SHUFFLE into VPICKEV (if possible).
-///
-/// VPICKEV copies the even elements of each vector into the result vector.
-///
-/// It is possible to lower into VPICKEV when the mask consists of two of the
-/// following forms concatenated:
-///   <0, 2, 4, ...>
-///   <n, n+2, n+4, ...>
-/// where n is the number of elements in the vector.
-/// For example:
-///   <0, 2, 4, ..., 0, 2, 4, ...>
-///   <0, 2, 4, ..., n, n+2, n+4, ...>
-///
-/// When undef's appear in the mask they are treated as if they were whatever
-/// value is necessary in order to fit the above forms.
-static SDValue lowerVECTOR_SHUFFLE_VPICKEV(const SDLoc &DL, ArrayRef<int> Mask,
-                                           MVT VT, SDValue V1, SDValue V2,
-                                           SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &Mid = Mask.begin() + Mask.size() / 2;
-  const auto &End = Mask.end();
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 1, Mid, 0, 2))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size(), 2))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(Mid, 1, End, 0, 2))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size(), 2))
-    V2 = OriV2;
-
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1);
-}
-
-/// Lower VECTOR_SHUFFLE into VPICKOD (if possible).
-///
-/// VPICKOD copies the odd elements of each vector into the result vector.
-///
-/// It is possible to lower into VPICKOD when the mask consists of two of the
-/// following forms concatenated:
-///   <1, 3, 5, ...>
-///   <n+1, n+3, n+5, ...>
-/// where n is the number of elements in the vector.
-/// For example:
-///   <1, 3, 5, ..., 1, 3, 5, ...>
-///   <1, 3, 5, ..., n+1, n+3, n+5, ...>
-///
-/// When undef's appear in the mask they are treated as if they were whatever
-/// value is necessary in order to fit the above forms.
-static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
-                                           MVT VT, SDValue V1, SDValue V2,
-                                           SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &Mid = Mask.begin() + Mask.size() / 2;
-  const auto &End = Mask.end();
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 1, Mid, 1, 2))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size() + 1, 2))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(Mid, 1, End, 1, 2))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size() + 1, 2))
-    V2 = OriV2;
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
-}
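
To make the six interleave/pick lowerings above easy to tell apart, these are the v4i32 masks each one accepts, assuming V1 supplies lanes 0..3 and V2 lanes 4..7 (so n = 4):

    int PackEv[] = {0, 4, 2, 6}; // VPACKEV: even lanes, interleaved
    int PackOd[] = {1, 5, 3, 7}; // VPACKOD: odd lanes, interleaved
    int IlvL[]   = {0, 4, 1, 5}; // VILVL: low halves, interleaved
    int IlvH[]   = {2, 6, 3, 7}; // VILVH: high halves, interleaved
    int PickEv[] = {0, 2, 4, 6}; // VPICKEV: even lanes, concatenated
    int PickOd[] = {1, 3, 5, 7}; // VPICKOD: odd lanes, concatenated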
-
-/// Lower VECTOR_SHUFFLE into VSHUF.
-///
-/// This mostly consists of converting the shuffle mask into a BUILD_VECTOR and
-/// adding it as an operand to the resulting VSHUF.
-static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask,
-                                         MVT VT, SDValue V1, SDValue V2,
-                                         SelectionDAG &DAG) {
-
-  SmallVector<SDValue, 16> Ops;
-  for (auto M : Mask)
-    Ops.push_back(DAG.getConstant(M, DL, MVT::i64));
-
-  EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
-  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
-
-  // VECTOR_SHUFFLE concatenates the vectors in an vectorwise fashion.
-  // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11>
-  // VSHF concatenates the vectors in a bitwise fashion:
-  // <0b00, 0b01> + <0b10, 0b11> ->
-  // 0b0100       + 0b1110       -> 0b01001110
-  //                                <0b10, 0b11, 0b00, 0b01>
-  // We must therefore swap the operands to get the correct result.
-  return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1);
-}
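
One concrete consequence of the operand swap, assuming v4i32:

    // For Mask = <0, 4, 1, 5> the node built is
    //   (VSHUF MaskVec<0,4,1,5>, V2, V1)
    // Passing V2 before V1 keeps index 0 selecting V1's lane 0 under
    // VSHUF's bitwise operand concatenation.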
-
-/// Dispatching routine to lower various 128-bit LoongArch vector shuffles.
-///
-/// This routine breaks down the specific type of 128-bit shuffle and
-/// dispatches to the lowering routines accordingly.
-static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
-                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
-  assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 ||
-          VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 ||
-          VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) &&
-         "Vector type is unsupported for lsx!");
-  assert(V1.getSimpleValueType() == V2.getSimpleValueType() &&
-         "Two operands have different types!");
-  assert(VT.getVectorNumElements() == Mask.size() &&
-         "Unexpected mask size for shuffle!");
-  assert(Mask.size() % 2 == 0 && "Expected even mask size.");
-
-  SDValue Result;
-  // TODO: Add more comparison patterns.
-  if (V2.isUndef()) {
-    if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG)))
-      return Result;
-    if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
-      return Result;
-
-    // TODO: This comment may be enabled in the future to better match the
-    // pattern for instruction selection.
-    /* V2 = V1; */
-  }
-
-  // It is recommended not to change the pattern comparison order for better
-  // performance.
-  if ((Result = lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_VILVH(DL, Mask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_VILVL(DL, Mask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_VPICKEV(DL, Mask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
-    return Result;
-
-  return SDValue();
-}
-
-/// Lower VECTOR_SHUFFLE into XVREPLVEI (if possible).
-///
-/// It is a XVREPLVEI when the mask is:
-///   <x, x, x, ..., x+n, x+n, x+n, ...>
-/// where the number of x is equal to n and n is half the length of vector.
-///
-/// When undef's appear in the mask they are treated as if they were whatever
-/// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
-                                             ArrayRef<int> Mask, MVT VT,
-                                             SDValue V1, SDValue V2,
-                                             SelectionDAG &DAG) {
-  int SplatIndex = -1;
-  for (const auto &M : Mask) {
-    if (M != -1) {
-      SplatIndex = M;
-      break;
-    }
-  }
-
-  if (SplatIndex == -1)
-    return DAG.getUNDEF(VT);
-
-  const auto &Begin = Mask.begin();
-  const auto &End = Mask.end();
-  unsigned HalfSize = Mask.size() / 2;
-
-  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
-  if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) &&
-      fitsRegularPattern<int>(Begin + HalfSize, 1, End, SplatIndex + HalfSize,
-                              0)) {
-    APInt Imm(64, SplatIndex);
-    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
-                       DAG.getConstant(Imm, DL, MVT::i64));
-  }
-
-  return SDValue();
-}
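
A qualifying 256-bit case, assuming v8i32 (HalfSize = 4):

    // Mask <1,1,1,1, 5,5,5,5>: SplatIndex = 1; the low half repeats 1
    // and the high half repeats 1 + HalfSize = 5, so this lowers to
    //   (VREPLVEI V1, 1)  // element 1 broadcast within each 128-bit half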
-
-/// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
-                                            MVT VT, SDValue V1, SDValue V2,
-                                            SelectionDAG &DAG) {
-  // When the size is less than or equal to 4, lower cost instructions may be
-  // used.
-  if (Mask.size() <= 4)
-    return SDValue();
-  return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
-}
-
-/// Lower VECTOR_SHUFFLE into XVPACKEV (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
-                                            MVT VT, SDValue V1, SDValue V2,
-                                            SelectionDAG &DAG) {
-  return lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG);
-}
-
-/// Lower VECTOR_SHUFFLE into XVPACKOD (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
-                                            MVT VT, SDValue V1, SDValue V2,
-                                            SelectionDAG &DAG) {
-  return lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG);
-}
-
-/// Lower VECTOR_SHUFFLE into XVILVH (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVILVH(const SDLoc &DL, ArrayRef<int> Mask,
-                                          MVT VT, SDValue V1, SDValue V2,
-                                          SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &End = Mask.end();
-  unsigned HalfSize = Mask.size() / 2;
-  unsigned LeftSize = HalfSize / 2;
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, HalfSize - LeftSize,
-                              1) &&
-      fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize + LeftSize, 1))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize,
-                                   Mask.size() + HalfSize - LeftSize, 1) &&
-           fitsRegularPattern<int>(Begin + HalfSize, 2, End,
-                                   Mask.size() + HalfSize + LeftSize, 1))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, HalfSize - LeftSize,
-                              1) &&
-      fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize + LeftSize,
-                              1))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize,
-                                   Mask.size() + HalfSize - LeftSize, 1) &&
-           fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End,
-                                   Mask.size() + HalfSize + LeftSize, 1))
-    V2 = OriV2;
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1);
-}
-
-/// Lower VECTOR_SHUFFLE into XVILVL (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVILVL(const SDLoc &DL, ArrayRef<int> Mask,
-                                          MVT VT, SDValue V1, SDValue V2,
-                                          SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &End = Mask.end();
-  unsigned HalfSize = Mask.size() / 2;
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, 0, 1) &&
-      fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize, 1))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, Mask.size(), 1) &&
-           fitsRegularPattern<int>(Begin + HalfSize, 2, End,
-                                   Mask.size() + HalfSize, 1))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, 0, 1) &&
-      fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize, 1))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, Mask.size(),
-                                   1) &&
-           fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End,
-                                   Mask.size() + HalfSize, 1))
-    V2 = OriV2;
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1);
-}
-
-/// Lower VECTOR_SHUFFLE into XVPICKEV (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVPICKEV(const SDLoc &DL, ArrayRef<int> Mask,
-                                            MVT VT, SDValue V1, SDValue V2,
-                                            SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &LeftMid = Mask.begin() + Mask.size() / 4;
-  const auto &Mid = Mask.begin() + Mask.size() / 2;
-  const auto &RightMid = Mask.end() - Mask.size() / 4;
-  const auto &End = Mask.end();
-  unsigned HalfSize = Mask.size() / 2;
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 1, LeftMid, 0, 2) &&
-      fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize, 2))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size(), 2) &&
-           fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize, 2))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(LeftMid, 1, Mid, 0, 2) &&
-      fitsRegularPattern<int>(RightMid, 1, End, HalfSize, 2))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size(), 2) &&
-           fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize, 2))
-    V2 = OriV2;
-
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1);
-}
-
-/// Lower VECTOR_SHUFFLE into XVPICKOD (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
-                                            MVT VT, SDValue V1, SDValue V2,
-                                            SelectionDAG &DAG) {
-
-  const auto &Begin = Mask.begin();
-  const auto &LeftMid = Mask.begin() + Mask.size() / 4;
-  const auto &Mid = Mask.begin() + Mask.size() / 2;
-  const auto &RightMid = Mask.end() - Mask.size() / 4;
-  const auto &End = Mask.end();
-  unsigned HalfSize = Mask.size() / 2;
-  SDValue OriV1 = V1, OriV2 = V2;
-
-  if (fitsRegularPattern<int>(Begin, 1, LeftMid, 1, 2) &&
-      fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize + 1, 2))
-    V1 = OriV1;
-  else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size() + 1, 2) &&
-           fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize + 1,
-                                   2))
-    V1 = OriV2;
-  else
-    return SDValue();
-
-  if (fitsRegularPattern<int>(LeftMid, 1, Mid, 1, 2) &&
-      fitsRegularPattern<int>(RightMid, 1, End, HalfSize + 1, 2))
-    V2 = OriV1;
-  else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size() + 1, 2) &&
-           fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize + 1,
-                                   2))
-    V2 = OriV2;
-  else
-    return SDValue();
-
-  return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
-}
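
For readers less familiar with the LSX/LASX shuffle family being dropped here, the six pack/pick/interleave forms differ only in which element indices they draw from. A minimal sketch from the shuffle mask's point of view (a is shuffle operand 0, b is operand 1; illustrative, not part of this patch); the XV* routines above additionally require the same pattern within each 128-bit lane:

    #include <array>
    using V4 = std::array<int, 4>;

    // "Pack" interleaves the even (resp. odd) elements of the two sources.
    V4 packev(V4 a, V4 b) { return {a[0], b[0], a[2], b[2]}; }
    V4 packod(V4 a, V4 b) { return {a[1], b[1], a[3], b[3]}; }
    // "Pick" concatenates the even (resp. odd) elements instead.
    V4 pickev(V4 a, V4 b) { return {a[0], a[2], b[0], b[2]}; }
    V4 pickod(V4 a, V4 b) { return {a[1], a[3], b[1], b[3]}; }
    // "Interleave" merges the low (resp. high) halves element by element.
    V4 ilvl(V4 a, V4 b) { return {a[0], b[0], a[1], b[1]}; }
    V4 ilvh(V4 a, V4 b) { return {a[2], b[2], a[3], b[3]}; }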
-
-/// Lower VECTOR_SHUFFLE into XVSHUF (if possible).
-static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
-                                          MVT VT, SDValue V1, SDValue V2,
-                                          SelectionDAG &DAG) {
-
-  int MaskSize = Mask.size();
-  int HalfSize = Mask.size() / 2;
-  const auto &Begin = Mask.begin();
-  const auto &Mid = Mask.begin() + HalfSize;
-  const auto &End = Mask.end();
-
-  // VECTOR_SHUFFLE concatenates the vectors:
-  //  <0, 1, 2, 3, 4, 5, 6, 7> + <8, 9, 10, 11, 12, 13, 14, 15>
-  //  shuffling ->
-  //  <0, 1, 2, 3, 8, 9, 10, 11> <4, 5, 6, 7, 12, 13, 14, 15>
-  //
-  // XVSHUF concatenates the vectors:
-  //  <a0, a1, a2, a3, b0, b1, b2, b3> + <a4, a5, a6, a7, b4, b5, b6, b7>
-  //  shuffling ->
-  //  <a0, a1, a2, a3, a4, a5, a6, a7> + <b0, b1, b2, b3, b4, b5, b6, b7>
-  SmallVector<SDValue, 8> MaskAlloc;
-  for (auto it = Begin; it < Mid; it++) {
-    if (*it < 0) // UNDEF
-      MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
-    else if ((*it >= 0 && *it < HalfSize) ||
-             (*it >= MaskSize && *it <= MaskSize + HalfSize)) {
-      int M = *it < HalfSize ? *it : *it - HalfSize;
-      MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64));
-    } else
-      return SDValue();
-  }
-  assert((int)MaskAlloc.size() == HalfSize && "xvshuf conversion failed!");
-
-  for (auto it = Mid; it < End; it++) {
-    if (*it < 0) // UNDEF
-      MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
-    else if ((*it >= HalfSize && *it < MaskSize) ||
-             (*it >= MaskSize + HalfSize && *it < MaskSize * 2)) {
-      int M = *it < MaskSize ? *it - HalfSize : *it - MaskSize;
-      MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64));
-    } else
-      return SDValue();
-  }
-  assert((int)MaskAlloc.size() == MaskSize && "xvshuf conversion failed!");
-
-  EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
-  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, MaskAlloc);
-  return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1);
-}
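
The comment above is the key to the XVSHUF conversion: the first half of the result may only draw from the low halves of the two sources, and the second half only from the high halves. A standalone sketch of that index remapping, assuming an even-sized mask over the concatenation of two equal-length vectors (illustrative, not part of this patch):

    #include <optional>
    #include <vector>

    std::optional<std::vector<int>>
    convertToXvshufIndices(const std::vector<int> &Mask) {
      int MaskSize = Mask.size(), HalfSize = MaskSize / 2;
      std::vector<int> Out;
      // First half of the result: only the low half of either source.
      for (int i = 0; i < HalfSize; ++i) {
        int M = Mask[i];
        if (M < 0)
          Out.push_back(0);            // UNDEF: any index will do.
        else if (M < HalfSize)
          Out.push_back(M);            // low half of V1
        else if (M >= MaskSize && M < MaskSize + HalfSize)
          Out.push_back(M - HalfSize); // low half of V2
        else
          return std::nullopt;         // crosses halves; XVSHUF can't do it
      }
      // Second half of the result: only the high half of either source.
      for (int i = HalfSize; i < MaskSize; ++i) {
        int M = Mask[i];
        if (M < 0)
          Out.push_back(0);
        else if (M >= HalfSize && M < MaskSize)
          Out.push_back(M - HalfSize); // high half of V1
        else if (M >= MaskSize + HalfSize && M < 2 * MaskSize)
          Out.push_back(M - MaskSize); // high half of V2
        else
          return std::nullopt;
      }
      return Out;
    }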
-
-/// Shuffle vectors by lane to generate more optimized instructions.
-/// 256-bit shuffles are always considered as 2-lane 128-bit shuffles.
-///
-/// Therefore, all cases other than the following four are regarded as
-/// cross-lane shuffles, where optimization is relatively limited.
-///
-/// - Shuffle high, low lanes of the two input vectors
-///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 3, 6>
-/// - Shuffle low, high lanes of the two input vectors
-///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 0, 5>
-/// - Shuffle low, low lanes of the two input vectors
-///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 3, 6>
-/// - Shuffle high, high lanes of the two input vectors
-///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 0, 5>
-///
-/// The first case is the closest to the LoongArch instructions and the
-/// other cases need to be converted to it before processing.
-///
-/// This function may modify V1, V2 and Mask.
-static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
-                                            MutableArrayRef<int> Mask, MVT VT,
-                                            SDValue &V1, SDValue &V2,
-                                            SelectionDAG &DAG) {
-
-  enum HalfMaskType { HighLaneTy, LowLaneTy, None };
-
-  int MaskSize = Mask.size();
-  int HalfSize = Mask.size() / 2;
-
-  HalfMaskType preMask = None, postMask = None;
-
-  if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) {
-        return M < 0 || (M >= 0 && M < HalfSize) ||
-               (M >= MaskSize && M < MaskSize + HalfSize);
-      }))
-    preMask = HighLaneTy;
-  else if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) {
-             return M < 0 || (M >= HalfSize && M < MaskSize) ||
-                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
-           }))
-    preMask = LowLaneTy;
-
-  if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
-        return M < 0 || (M >= 0 && M < HalfSize) ||
-               (M >= MaskSize && M < MaskSize + HalfSize);
-      }))
-    postMask = HighLaneTy;
-  else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
-             return M < 0 || (M >= HalfSize && M < MaskSize) ||
-                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
-           }))
-    postMask = LowLaneTy;
-
-  // The first half of the mask is high-lane type and the second half is
-  // low-lane type, which is the case closest to the LoongArch instructions.
-  //
-  // Note: In the LoongArch architecture, the high lane of the mask
-  // corresponds to the lower 128 bits of the vector register, and the low
-  // lane of the mask corresponds to the higher 128 bits.
-  if (preMask == HighLaneTy && postMask == LowLaneTy) {
-    return;
-  }
-  if (preMask == LowLaneTy && postMask == HighLaneTy) {
-    V1 = DAG.getBitcast(MVT::v4i64, V1);
-    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
-                     DAG.getConstant(0b01001110, DL, MVT::i64));
-    V1 = DAG.getBitcast(VT, V1);
-
-    if (!V2.isUndef()) {
-      V2 = DAG.getBitcast(MVT::v4i64, V2);
-      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
-                       DAG.getConstant(0b01001110, DL, MVT::i64));
-      V2 = DAG.getBitcast(VT, V2);
-    }
-
-    for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) {
-      *it = *it < 0 ? *it : *it - HalfSize;
-    }
-    for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) {
-      *it = *it < 0 ? *it : *it + HalfSize;
-    }
-  } else if (preMask == LowLaneTy && postMask == LowLaneTy) {
-    V1 = DAG.getBitcast(MVT::v4i64, V1);
-    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
-                     DAG.getConstant(0b11101110, DL, MVT::i64));
-    V1 = DAG.getBitcast(VT, V1);
-
-    if (!V2.isUndef()) {
-      V2 = DAG.getBitcast(MVT::v4i64, V2);
-      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
-                       DAG.getConstant(0b11101110, DL, MVT::i64));
-      V2 = DAG.getBitcast(VT, V2);
-    }
-
-    for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) {
-      *it = *it < 0 ? *it : *it - HalfSize;
-    }
-  } else if (preMask == HighLaneTy && postMask == HighLaneTy) {
-    V1 = DAG.getBitcast(MVT::v4i64, V1);
-    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
-                     DAG.getConstant(0b01000100, DL, MVT::i64));
-    V1 = DAG.getBitcast(VT, V1);
-
-    if (!V2.isUndef()) {
-      V2 = DAG.getBitcast(MVT::v4i64, V2);
-      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
-                       DAG.getConstant(0b01000100, DL, MVT::i64));
-      V2 = DAG.getBitcast(VT, V2);
-    }
-
-    for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) {
-      *it = *it < 0 ? *it : *it + HalfSize;
-    }
-  } else { // cross-lane
-    return;
-  }
-}
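
The XVPERMI_D immediates used above read more easily as four 2-bit element selectors, in the style of x86's vpermq. A sketch of the assumed semantics on a v4i64 (illustrative, not part of this patch):

    #include <array>
    #include <cstdint>

    // Each 2-bit field of Imm selects one of the four 64-bit elements.
    std::array<uint64_t, 4> xvpermiD(const std::array<uint64_t, 4> &Src,
                                     uint8_t Imm) {
      std::array<uint64_t, 4> Dst;
      for (int i = 0; i < 4; ++i)
        Dst[i] = Src[(Imm >> (2 * i)) & 0x3];
      return Dst;
    }
    // 0b01001110 -> {Src[2], Src[3], Src[0], Src[1]}: swap the 128-bit lanes.
    // 0b11101110 -> {Src[2], Src[3], Src[2], Src[3]}: repeat the high lane.
    // 0b01000100 -> {Src[0], Src[1], Src[0], Src[1]}: repeat the low lane.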
-
-/// Dispatching routine to lower various 256-bit LoongArch vector shuffles.
-///
-/// This routine breaks down the specific type of 256-bit shuffle and
-/// dispatches to the lowering routines accordingly.
-static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
-                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
-  assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 ||
-          VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 ||
-          VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) &&
-         "Vector type is unsupported for lasx!");
-  assert(V1.getSimpleValueType() == V2.getSimpleValueType() &&
-         "Two operands have different types!");
-  assert(VT.getVectorNumElements() == Mask.size() &&
-         "Unexpected mask size for shuffle!");
-  assert(Mask.size() % 2 == 0 && "Expected even mask size.");
-  assert(Mask.size() >= 4 && "Mask size is less than 4.");
-
-  // Canonicalize non-cross-lane shuffle vectors.
-  SmallVector<int> NewMask(Mask);
-  canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG);
-
-  SDValue Result;
-  // TODO: Add more comparison patterns.
-  if (V2.isUndef()) {
-    if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG)))
-      return Result;
-    if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
-      return Result;
-
-    // TODO: The commented-out assignment below may be enabled in the future
-    // to better match the pattern for instruction selection.
-    /* V2 = V1; */
-  }
-
-  // It is recommended not to change the order of these pattern checks; it
-  // is tuned for better performance.
-  if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
-    return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG)))
-    return Result;
-
-  return SDValue();
-}
-
 SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> OrigMask = SVOp->getMask();
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
-  int NumElements = VT.getVectorNumElements();
-  SDLoc DL(Op);
-
-  bool V1IsUndef = V1.isUndef();
-  bool V2IsUndef = V2.isUndef();
-  if (V1IsUndef && V2IsUndef)
-    return DAG.getUNDEF(VT);
-
-  // When we create a shuffle node we put the UNDEF node as the second
-  // operand, but in some cases the first operand may be transformed into
-  // UNDEF. In this case we should just commute the node.
-  if (V1IsUndef)
-    return DAG.getCommutedVectorShuffle(*SVOp);
-
-  // Check for non-undef masks pointing at an undef vector and make the masks
-  // undef as well. This makes it easier to match the shuffle based solely on
-  // the mask.
-  if (V2IsUndef &&
-      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
-    SmallVector<int, 8> NewMask(OrigMask);
-    for (int &M : NewMask)
-      if (M >= NumElements)
-        M = -1;
-    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
-  }
-
-  // Check for illegal shuffle mask element index values.
-  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
-  (void)MaskUpperLimit;
-  assert(llvm::all_of(OrigMask,
-                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
-         "Out of bounds shuffle index");
-
-  // For each vector width, delegate to a specialized lowering routine.
-  if (VT.is128BitVector())
-    return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
-
-  if (VT.is256BitVector())
-    return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
-
+  // TODO: custom shuffle.
   return SDValue();
 }
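
One piece of the removed prologue worth calling out is the mask canonicalization against an undef second operand. In isolation, assuming NumElements is the per-operand element count, it amounts to this (sketch, not part of this patch):

    #include <vector>

    // Indices into an undef second operand carry no information; marking
    // them undef (-1) lets later matching look only at the mask.
    void canonicalizeUndefIndices(std::vector<int> &Mask, int NumElements) {
      for (int &M : Mask)
        if (M >= NumElements)
          M = -1;
    }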
 
@@ -3446,169 +2528,6 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static bool checkValueWidth(SDValue V, ISD::LoadExtType &ExtType) {
-  ExtType = ISD::NON_EXTLOAD;
-
-  switch (V.getNode()->getOpcode()) {
-  case ISD::LOAD: {
-    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
-    if ((LoadNode->getMemoryVT() == MVT::i8) ||
-        (LoadNode->getMemoryVT() == MVT::i16)) {
-      ExtType = LoadNode->getExtensionType();
-      return true;
-    }
-    return false;
-  }
-  case ISD::AssertSext: {
-    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
-    if ((TypeNode->getVT() == MVT::i8) || (TypeNode->getVT() == MVT::i16)) {
-      ExtType = ISD::SEXTLOAD;
-      return true;
-    }
-    return false;
-  }
-  case ISD::AssertZext: {
-    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
-    if ((TypeNode->getVT() == MVT::i8) || (TypeNode->getVT() == MVT::i16)) {
-      ExtType = ISD::ZEXTLOAD;
-      return true;
-    }
-    return false;
-  }
-  default:
-    return false;
-  }
-
-  return false;
-}
-
-// Eliminate redundant truncation and zero-extension nodes.
-// * Case 1:
-//  +------------+ +------------+ +------------+
-//  |   Input1   | |   Input2   | |     CC     |
-//  +------------+ +------------+ +------------+
-//         |              |              |
-//         V              V              +----+
-//  +------------+ +------------+             |
-//  |  TRUNCATE  | |  TRUNCATE  |             |
-//  +------------+ +------------+             |
-//         |              |                   |
-//         V              V                   |
-//  +------------+ +------------+             |
-//  |  ZERO_EXT  | |  ZERO_EXT  |             |
-//  +------------+ +------------+             |
-//         |              |                   |
-//         |              +-------------+     |
-//         V              V             |     |
-//        +----------------+            |     |
-//        |      AND       |            |     |
-//        +----------------+            |     |
-//                |                     |     |
-//                +---------------+     |     |
-//                                |     |     |
-//                                V     V     V
-//                               +-------------+
-//                               |     CMP     |
-//                               +-------------+
-// * Case 2:
-//  +------------+ +------------+ +-------------+ +------------+ +------------+
-//  |   Input1   | |   Input2   | | Constant -1 | | Constant 0 | |     CC     |
-//  +------------+ +------------+ +-------------+ +------------+ +------------+
-//         |              |             |               |               |
-//         V              |             |               |               |
-//  +------------+        |             |               |               |
-//  |     XOR    |<---------------------+               |               |
-//  +------------+        |                             |               |
-//         |              |                             |               |
-//         V              V             +---------------+               |
-//  +------------+ +------------+       |                               |
-//  |  TRUNCATE  | |  TRUNCATE  |       |     +-------------------------+
-//  +------------+ +------------+       |     |
-//         |              |             |     |
-//         V              V             |     |
-//  +------------+ +------------+       |     |
-//  |  ZERO_EXT  | |  ZERO_EXT  |       |     |
-//  +------------+ +------------+       |     |
-//         |              |             |     |
-//         V              V             |     |
-//        +----------------+            |     |
-//        |      AND       |            |     |
-//        +----------------+            |     |
-//                |                     |     |
-//                +---------------+     |     |
-//                                |     |     |
-//                                V     V     V
-//                               +-------------+
-//                               |     CMP     |
-//                               +-------------+
-static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
-                                   TargetLowering::DAGCombinerInfo &DCI,
-                                   const LoongArchSubtarget &Subtarget) {
-  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
-
-  SDNode *AndNode = N->getOperand(0).getNode();
-  if (AndNode->getOpcode() != ISD::AND)
-    return SDValue();
-
-  SDValue AndInputValue2 = AndNode->getOperand(1);
-  if (AndInputValue2.getOpcode() != ISD::ZERO_EXTEND)
-    return SDValue();
-
-  SDValue CmpInputValue = N->getOperand(1);
-  SDValue AndInputValue1 = AndNode->getOperand(0);
-  if (AndInputValue1.getOpcode() == ISD::XOR) {
-    if (CC != ISD::SETEQ && CC != ISD::SETNE)
-      return SDValue();
-    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndInputValue1.getOperand(1));
-    if (!CN || CN->getSExtValue() != -1)
-      return SDValue();
-    CN = dyn_cast<ConstantSDNode>(CmpInputValue);
-    if (!CN || CN->getSExtValue() != 0)
-      return SDValue();
-    AndInputValue1 = AndInputValue1.getOperand(0);
-    if (AndInputValue1.getOpcode() != ISD::ZERO_EXTEND)
-      return SDValue();
-  } else if (AndInputValue1.getOpcode() == ISD::ZERO_EXTEND) {
-    if (AndInputValue2 != CmpInputValue)
-      return SDValue();
-  } else {
-    return SDValue();
-  }
-
-  SDValue TruncValue1 = AndInputValue1.getNode()->getOperand(0);
-  if (TruncValue1.getOpcode() != ISD::TRUNCATE)
-    return SDValue();
-
-  SDValue TruncValue2 = AndInputValue2.getNode()->getOperand(0);
-  if (TruncValue2.getOpcode() != ISD::TRUNCATE)
-    return SDValue();
-
-  SDValue TruncInputValue1 = TruncValue1.getNode()->getOperand(0);
-  SDValue TruncInputValue2 = TruncValue2.getNode()->getOperand(0);
-  ISD::LoadExtType ExtType1;
-  ISD::LoadExtType ExtType2;
-
-  if (!checkValueWidth(TruncInputValue1, ExtType1) ||
-      !checkValueWidth(TruncInputValue2, ExtType2))
-    return SDValue();
-
-  if (TruncInputValue1->getValueType(0) != TruncInputValue2->getValueType(0) ||
-      AndNode->getValueType(0) != TruncInputValue1->getValueType(0))
-    return SDValue();
-
-  if ((ExtType2 != ISD::ZEXTLOAD) &&
-      ((ExtType2 != ISD::SEXTLOAD) && (ExtType1 != ISD::SEXTLOAD)))
-    return SDValue();
-
-  // These truncation and zero-extension nodes are not necessary; remove them.
-  SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N), AndNode->getValueType(0),
-                               TruncInputValue1, TruncInputValue2);
-  SDValue NewSetCC =
-      DAG.getSetCC(SDLoc(N), N->getValueType(0), NewAnd, TruncInputValue2, CC);
-  DAG.ReplaceAllUsesWith(N, NewSetCC.getNode());
-  return SDValue(N, 0);
-}
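
The two diagrams compress into a simple scalar identity. A runnable sketch (not part of the patch) that checks it exhaustively for zero-extended byte values, the i8 case accepted by checkValueWidth:

    #include <cassert>

    int main() {
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b) {
          // Case 1: trunc-to-i8 / zext-to-i32 around the AND is redundant
          // for values already known to fit in a byte.
          unsigned Narrow = (a & 0xFFu) & (b & 0xFFu);
          assert((Narrow == b) == ((a & b) == b));
          // Case 2: the XOR-with-minus-one form is the same predicate as a
          // compare against zero: ((~a) & b) == 0  <=>  (a & b) == b.
          assert(((~a & b) == 0) == ((a & b) == b));
        }
      return 0;
    }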
-
 // Combine (loongarch_bitrev_w (loongarch_revb_2w X)) to loongarch_bitrev_4b.
 static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
@@ -4236,8 +3155,6 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performANDCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:
     return performORCombine(N, DAG, DCI, Subtarget);
-  case ISD::SETCC:
-    return performSETCCCombine(N, DAG, DCI, Subtarget);
   case ISD::SRL:
     return performSRLCombine(N, DAG, DCI, Subtarget);
   case LoongArchISD::BITREV_W:
@@ -4623,16 +3540,6 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(MOVFCSR2GR)
     NODE_NAME_CASE(CACOP_D)
     NODE_NAME_CASE(CACOP_W)
-    NODE_NAME_CASE(VSHUF)
-    NODE_NAME_CASE(VPICKEV)
-    NODE_NAME_CASE(VPICKOD)
-    NODE_NAME_CASE(VPACKEV)
-    NODE_NAME_CASE(VPACKOD)
-    NODE_NAME_CASE(VILVL)
-    NODE_NAME_CASE(VILVH)
-    NODE_NAME_CASE(VSHUF4I)
-    NODE_NAME_CASE(VREPLVEI)
-    NODE_NAME_CASE(XVPERMI)
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index fc5b36c2124e0..f4c57f80fdbe4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -120,16 +120,6 @@ enum NodeType : unsigned {
 
   // Vector Shuffle
   VREPLVE,
-  VSHUF,
-  VPICKEV,
-  VPICKOD,
-  VPACKEV,
-  VPACKOD,
-  VILVL,
-  VILVH,
-  VSHUF4I,
-  VREPLVEI,
-  XVPERMI,
 
   // Extended vector element extraction
   VPICK_SEXT_ELT,
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 6f1969bf8cae0..3de1fe2b722e5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,8 +10,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_loongArchV1RUimm>;
-
 def lasxsplati8
   : PatFrag<(ops node:$e0),
             (v32i8 (build_vector node:$e0, node:$e0, node:$e0, node:$e0,
@@ -1577,134 +1575,6 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk),
 def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk),
           (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>;
 
-// XVSHUF_{B/H/W/D}
-def : Pat<(loongarch_vshuf v32i8:$xa, v32i8:$xj, v32i8:$xk),
-          (XVSHUF_B v32i8:$xj, v32i8:$xk, v32i8:$xa)>;
-def : Pat<(loongarch_vshuf v16i16:$xd, v16i16:$xj, v16i16:$xk),
-          (XVSHUF_H v16i16:$xd, v16i16:$xj, v16i16:$xk)>;
-def : Pat<(loongarch_vshuf v8i32:$xd, v8i32:$xj, v8i32:$xk),
-          (XVSHUF_W v8i32:$xd, v8i32:$xj, v8i32:$xk)>;
-def : Pat<(loongarch_vshuf v4i64:$xd, v4i64:$xj, v4i64:$xk),
-          (XVSHUF_D v4i64:$xd, v4i64:$xj, v4i64:$xk)>;
-def : Pat<(loongarch_vshuf v8i32:$xd, v8f32:$xj, v8f32:$xk),
-          (XVSHUF_W v8i32:$xd, v8f32:$xj, v8f32:$xk)>;
-def : Pat<(loongarch_vshuf v4i64:$xd, v4f64:$xj, v4f64:$xk),
-          (XVSHUF_D v4i64:$xd, v4f64:$xj, v4f64:$xk)>;
-
-// XVPICKEV_{B/H/W/D}
-def : Pat<(loongarch_vpickev v32i8:$xj, v32i8:$xk),
-          (XVPICKEV_B v32i8:$xj, v32i8:$xk)>;
-def : Pat<(loongarch_vpickev v16i16:$xj, v16i16:$xk),
-          (XVPICKEV_H v16i16:$xj, v16i16:$xk)>;
-def : Pat<(loongarch_vpickev v8i32:$xj, v8i32:$xk),
-          (XVPICKEV_W v8i32:$xj, v8i32:$xk)>;
-def : Pat<(loongarch_vpickev v4i64:$xj, v4i64:$xk),
-          (XVPICKEV_D v4i64:$xj, v4i64:$xk)>;
-def : Pat<(loongarch_vpickev v8f32:$xj, v8f32:$xk),
-          (XVPICKEV_W v8f32:$xj, v8f32:$xk)>;
-def : Pat<(loongarch_vpickev v4f64:$xj, v4f64:$xk),
-          (XVPICKEV_D v4f64:$xj, v4f64:$xk)>;
-
-// XVPICKOD_{B/H/W/D}
-def : Pat<(loongarch_vpickod v32i8:$xj, v32i8:$xk),
-          (XVPICKOD_B v32i8:$xj, v32i8:$xk)>;
-def : Pat<(loongarch_vpickod v16i16:$xj, v16i16:$xk),
-          (XVPICKOD_H v16i16:$xj, v16i16:$xk)>;
-def : Pat<(loongarch_vpickod v8i32:$xj, v8i32:$xk),
-          (XVPICKOD_W v8i32:$xj, v8i32:$xk)>;
-def : Pat<(loongarch_vpickod v4i64:$xj, v4i64:$xk),
-          (XVPICKOD_D v4i64:$xj, v4i64:$xk)>;
-def : Pat<(loongarch_vpickod v8f32:$xj, v8f32:$xk),
-          (XVPICKOD_W v8f32:$xj, v8f32:$xk)>;
-def : Pat<(loongarch_vpickod v4f64:$xj, v4f64:$xk),
-          (XVPICKOD_D v4f64:$xj, v4f64:$xk)>;
-
-// XVPACKEV_{B/H/W/D}
-def : Pat<(loongarch_vpackev v32i8:$xj, v32i8:$xk),
-          (XVPACKEV_B v32i8:$xj, v32i8:$xk)>;
-def : Pat<(loongarch_vpackev v16i16:$xj, v16i16:$xk),
-          (XVPACKEV_H v16i16:$xj, v16i16:$xk)>;
-def : Pat<(loongarch_vpackev v8i32:$xj, v8i32:$xk),
-          (XVPACKEV_W v8i32:$xj, v8i32:$xk)>;
-def : Pat<(loongarch_vpackev v4i64:$xj, v4i64:$xk),
-          (XVPACKEV_D v4i64:$xj, v4i64:$xk)>;
-def : Pat<(loongarch_vpackev v8f32:$xj, v8f32:$xk),
-          (XVPACKEV_W v8f32:$xj, v8f32:$xk)>;
-def : Pat<(loongarch_vpackev v4f64:$xj, v4f64:$xk),
-          (XVPACKEV_D v4f64:$xj, v4f64:$xk)>;
-
-// XVPACKOD_{B/H/W/D}
-def : Pat<(loongarch_vpackod v32i8:$xj, v32i8:$xk),
-          (XVPACKOD_B v32i8:$xj, v32i8:$xk)>;
-def : Pat<(loongarch_vpackod v16i16:$xj, v16i16:$xk),
-          (XVPACKOD_H v16i16:$xj, v16i16:$xk)>;
-def : Pat<(loongarch_vpackod v8i32:$xj, v8i32:$xk),
-          (XVPACKOD_W v8i32:$xj, v8i32:$xk)>;
-def : Pat<(loongarch_vpackod v4i64:$xj, v4i64:$xk),
-          (XVPACKOD_D v4i64:$xj, v4i64:$xk)>;
-def : Pat<(loongarch_vpackod v8f32:$xj, v8f32:$xk),
-          (XVPACKOD_W v8f32:$xj, v8f32:$xk)>;
-def : Pat<(loongarch_vpackod v4f64:$xj, v4f64:$xk),
-          (XVPACKOD_D v4f64:$xj, v4f64:$xk)>;
-
-// XVILVL_{B/H/W/D}
-def : Pat<(loongarch_vilvl v32i8:$xj, v32i8:$xk),
-          (XVILVL_B v32i8:$xj, v32i8:$xk)>;
-def : Pat<(loongarch_vilvl v16i16:$xj, v16i16:$xk),
-          (XVILVL_H v16i16:$xj, v16i16:$xk)>;
-def : Pat<(loongarch_vilvl v8i32:$xj, v8i32:$xk),
-          (XVILVL_W v8i32:$xj, v8i32:$xk)>;
-def : Pat<(loongarch_vilvl v4i64:$xj, v4i64:$xk),
-          (XVILVL_D v4i64:$xj, v4i64:$xk)>;
-def : Pat<(loongarch_vilvl v8f32:$xj, v8f32:$xk),
-          (XVILVL_W v8f32:$xj, v8f32:$xk)>;
-def : Pat<(loongarch_vilvl v4f64:$xj, v4f64:$xk),
-          (XVILVL_D v4f64:$xj, v4f64:$xk)>;
-
-// XVILVH_{B/H/W/D}
-def : Pat<(loongarch_vilvh v32i8:$xj, v32i8:$xk),
-          (XVILVH_B v32i8:$xj, v32i8:$xk)>;
-def : Pat<(loongarch_vilvh v16i16:$xj, v16i16:$xk),
-          (XVILVH_H v16i16:$xj, v16i16:$xk)>;
-def : Pat<(loongarch_vilvh v8i32:$xj, v8i32:$xk),
-          (XVILVH_W v8i32:$xj, v8i32:$xk)>;
-def : Pat<(loongarch_vilvh v4i64:$xj, v4i64:$xk),
-          (XVILVH_D v4i64:$xj, v4i64:$xk)>;
-def : Pat<(loongarch_vilvh v8f32:$xj, v8f32:$xk),
-          (XVILVH_W v8f32:$xj, v8f32:$xk)>;
-def : Pat<(loongarch_vilvh v4f64:$xj, v4f64:$xk),
-          (XVILVH_D v4f64:$xj, v4f64:$xk)>;
-
-// XVSHUF4I_{B/H/W}
-def : Pat<(loongarch_vshuf4i v32i8:$xj, immZExt8:$ui8),
-          (XVSHUF4I_B v32i8:$xj, immZExt8:$ui8)>;
-def : Pat<(loongarch_vshuf4i v16i16:$xj, immZExt8:$ui8),
-        (XVSHUF4I_H v16i16:$xj, immZExt8:$ui8)>;
-def : Pat<(loongarch_vshuf4i v8i32:$xj, immZExt8:$ui8),
-        (XVSHUF4I_W v8i32:$xj, immZExt8:$ui8)>;
-def : Pat<(loongarch_vshuf4i v8f32:$xj, immZExt8:$ui8),
-        (XVSHUF4I_W v8f32:$xj, immZExt8:$ui8)>;
-
-// XVREPL128VEI_{B/H/W/D}
-def : Pat<(loongarch_vreplvei v32i8:$xj, immZExt4:$ui4),
-          (XVREPL128VEI_B v32i8:$xj, immZExt4:$ui4)>;
-def : Pat<(loongarch_vreplvei v16i16:$xj, immZExt3:$ui3),
-        (XVREPL128VEI_H v16i16:$xj, immZExt3:$ui3)>;
-def : Pat<(loongarch_vreplvei v8i32:$xj, immZExt2:$ui2),
-        (XVREPL128VEI_W v8i32:$xj, immZExt2:$ui2)>;
-def : Pat<(loongarch_vreplvei v4i64:$xj, immZExt1:$ui1),
-        (XVREPL128VEI_D v4i64:$xj, immZExt1:$ui1)>;
-def : Pat<(loongarch_vreplvei v8f32:$xj, immZExt2:$ui2),
-        (XVREPL128VEI_W v8f32:$xj, immZExt2:$ui2)>;
-def : Pat<(loongarch_vreplvei v4f64:$xj, immZExt1:$ui1),
-        (XVREPL128VEI_D v4f64:$xj, immZExt1:$ui1)>;
-
-// XVPERMI_D
-def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8),
-          (XVPERMI_D v4i64:$xj, immZExt8: $ui8)>;
-def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8),
-          (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>;
-
 // XVREPLVE0_{W/D}
 def : Pat<(lasxsplatf32 FPR32:$fj),
           (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 0580683c3ce30..39ee861cd0565 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -15,15 +15,6 @@ def SDT_LoongArchVreplve : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
                                          SDTCisSameAs<0, 1>, SDTCisInt<2>]>;
 def SDT_LoongArchVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
 
-def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>,
-                                         SDTCisInt<1>, SDTCisVec<1>,
-                                         SDTCisSameAs<0, 2>,
-                                         SDTCisSameAs<2, 3>]>;
-def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>,
-                                         SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
-def SDT_loongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>,
-                                         SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>;
-
 // Target nodes.
 def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>;
 def loongarch_vall_nonzero : SDNode<"LoongArchISD::VALL_NONZERO",
@@ -40,23 +31,6 @@ def loongarch_vpick_sext_elt : SDNode<"LoongArchISD::VPICK_SEXT_ELT",
 def loongarch_vpick_zext_elt : SDNode<"LoongArchISD::VPICK_ZEXT_ELT",
                                       SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>>;
 
-def loongarch_vshuf: SDNode<"LoongArchISD::VSHUF", SDT_LoongArchVShuf>;
-def loongarch_vpickev: SDNode<"LoongArchISD::VPICKEV", SDT_LoongArchV2R>;
-def loongarch_vpickod: SDNode<"LoongArchISD::VPICKOD", SDT_LoongArchV2R>;
-def loongarch_vpackev: SDNode<"LoongArchISD::VPACKEV", SDT_LoongArchV2R>;
-def loongarch_vpackod: SDNode<"LoongArchISD::VPACKOD", SDT_LoongArchV2R>;
-def loongarch_vilvl: SDNode<"LoongArchISD::VILVL", SDT_LoongArchV2R>;
-def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>;
-
-def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_loongArchV1RUimm>;
-def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_loongArchV1RUimm>;
-
-def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
-def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
-def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
-def immZExt4 : ImmLeaf<i64, [{return isUInt<4>(Imm);}]>;
-def immZExt8 : ImmLeaf<i64, [{return isUInt<8>(Imm);}]>;
-
 class VecCond<SDPatternOperator OpNode, ValueType TyNode,
               RegisterClass RC = LSX128>
     : Pseudo<(outs GPR:$rd), (ins RC:$vj),
@@ -1708,128 +1682,6 @@ def : Pat<(loongarch_vreplve v4i32:$vj, GRLenVT:$rk),
 def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk),
           (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>;
 
-// VSHUF_{B/H/W/D}
-def : Pat<(loongarch_vshuf v16i8:$va, v16i8:$vj, v16i8:$vk),
-          (VSHUF_B v16i8:$vj, v16i8:$vk, v16i8:$va)>;
-def : Pat<(loongarch_vshuf v8i16:$vd, v8i16:$vj, v8i16:$vk),
-          (VSHUF_H v8i16:$vd, v8i16:$vj, v8i16:$vk)>;
-def : Pat<(loongarch_vshuf v4i32:$vd, v4i32:$vj, v4i32:$vk),
-          (VSHUF_W v4i32:$vd, v4i32:$vj, v4i32:$vk)>;
-def : Pat<(loongarch_vshuf v2i64:$vd, v2i64:$vj, v2i64:$vk),
-          (VSHUF_D v2i64:$vd, v2i64:$vj, v2i64:$vk)>;
-def : Pat<(loongarch_vshuf v4i32:$vd, v4f32:$vj, v4f32:$vk),
-          (VSHUF_W v4i32:$vd, v4f32:$vj, v4f32:$vk)>;
-def : Pat<(loongarch_vshuf v2i64:$vd, v2f64:$vj, v2f64:$vk),
-          (VSHUF_D v2i64:$vd, v2f64:$vj, v2f64:$vk)>;
-
-// VPICKEV_{B/H/W/D}
-def : Pat<(loongarch_vpickev v16i8:$vj, v16i8:$vk),
-          (VPICKEV_B v16i8:$vj, v16i8:$vk)>;
-def : Pat<(loongarch_vpickev v8i16:$vj, v8i16:$vk),
-          (VPICKEV_H v8i16:$vj, v8i16:$vk)>;
-def : Pat<(loongarch_vpickev v4i32:$vj, v4i32:$vk),
-          (VPICKEV_W v4i32:$vj, v4i32:$vk)>;
-def : Pat<(loongarch_vpickev v2i64:$vj, v2i64:$vk),
-          (VPICKEV_D v2i64:$vj, v2i64:$vk)>;
-def : Pat<(loongarch_vpickev v4f32:$vj, v4f32:$vk),
-          (VPICKEV_W v4f32:$vj, v4f32:$vk)>;
-def : Pat<(loongarch_vpickev v2f64:$vj, v2f64:$vk),
-          (VPICKEV_D v2f64:$vj, v2f64:$vk)>;
-
-// VPICKOD_{B/H/W/D}
-def : Pat<(loongarch_vpickod v16i8:$vj, v16i8:$vk),
-          (VPICKOD_B v16i8:$vj, v16i8:$vk)>;
-def : Pat<(loongarch_vpickod v8i16:$vj, v8i16:$vk),
-          (VPICKOD_H v8i16:$vj, v8i16:$vk)>;
-def : Pat<(loongarch_vpickod v4i32:$vj, v4i32:$vk),
-          (VPICKOD_W v4i32:$vj, v4i32:$vk)>;
-def : Pat<(loongarch_vpickod v2i64:$vj, v2i64:$vk),
-          (VPICKOD_D v2i64:$vj, v2i64:$vk)>;
-def : Pat<(loongarch_vpickod v4f32:$vj, v4f32:$vk),
-          (VPICKOD_W v4f32:$vj, v4f32:$vk)>;
-def : Pat<(loongarch_vpickod v2f64:$vj, v2f64:$vk),
-          (VPICKOD_D v2f64:$vj, v2f64:$vk)>;
-
-// VPACKEV_{B/H/W/D}
-def : Pat<(loongarch_vpackev v16i8:$vj, v16i8:$vk),
-          (VPACKEV_B v16i8:$vj, v16i8:$vk)>;
-def : Pat<(loongarch_vpackev v8i16:$vj, v8i16:$vk),
-          (VPACKEV_H v8i16:$vj, v8i16:$vk)>;
-def : Pat<(loongarch_vpackev v4i32:$vj, v4i32:$vk),
-          (VPACKEV_W v4i32:$vj, v4i32:$vk)>;
-def : Pat<(loongarch_vpackev v2i64:$vj, v2i64:$vk),
-          (VPACKEV_D v2i64:$vj, v2i64:$vk)>;
-def : Pat<(loongarch_vpackev v4f32:$vj, v4f32:$vk),
-          (VPACKEV_W v4f32:$vj, v4f32:$vk)>;
-def : Pat<(loongarch_vpackev v2f64:$vj, v2f64:$vk),
-          (VPACKEV_D v2f64:$vj, v2f64:$vk)>;
-
-// VPACKOD_{B/H/W/D}
-def : Pat<(loongarch_vpackod v16i8:$vj, v16i8:$vk),
-          (VPACKOD_B v16i8:$vj, v16i8:$vk)>;
-def : Pat<(loongarch_vpackod v8i16:$vj, v8i16:$vk),
-          (VPACKOD_H v8i16:$vj, v8i16:$vk)>;
-def : Pat<(loongarch_vpackod v4i32:$vj, v4i32:$vk),
-          (VPACKOD_W v4i32:$vj, v4i32:$vk)>;
-def : Pat<(loongarch_vpackod v2i64:$vj, v2i64:$vk),
-          (VPACKOD_D v2i64:$vj, v2i64:$vk)>;
-def : Pat<(loongarch_vpackod v4f32:$vj, v4f32:$vk),
-          (VPACKOD_W v4f32:$vj, v4f32:$vk)>;
-def : Pat<(loongarch_vpackod v2f64:$vj, v2f64:$vk),
-          (VPACKOD_D v2f64:$vj, v2f64:$vk)>;
-
-// VILVL_{B/H/W/D}
-def : Pat<(loongarch_vilvl v16i8:$vj, v16i8:$vk),
-          (VILVL_B v16i8:$vj, v16i8:$vk)>;
-def : Pat<(loongarch_vilvl v8i16:$vj, v8i16:$vk),
-          (VILVL_H v8i16:$vj, v8i16:$vk)>;
-def : Pat<(loongarch_vilvl v4i32:$vj, v4i32:$vk),
-          (VILVL_W v4i32:$vj, v4i32:$vk)>;
-def : Pat<(loongarch_vilvl v2i64:$vj, v2i64:$vk),
-          (VILVL_D v2i64:$vj, v2i64:$vk)>;
-def : Pat<(loongarch_vilvl v4f32:$vj, v4f32:$vk),
-          (VILVL_W v4f32:$vj, v4f32:$vk)>;
-def : Pat<(loongarch_vilvl v2f64:$vj, v2f64:$vk),
-          (VILVL_D v2f64:$vj, v2f64:$vk)>;
-
-// VILVH_{B/H/W/D}
-def : Pat<(loongarch_vilvh v16i8:$vj, v16i8:$vk),
-          (VILVH_B v16i8:$vj, v16i8:$vk)>;
-def : Pat<(loongarch_vilvh v8i16:$vj, v8i16:$vk),
-          (VILVH_H v8i16:$vj, v8i16:$vk)>;
-def : Pat<(loongarch_vilvh v4i32:$vj, v4i32:$vk),
-          (VILVH_W v4i32:$vj, v4i32:$vk)>;
-def : Pat<(loongarch_vilvh v2i64:$vj, v2i64:$vk),
-          (VILVH_D v2i64:$vj, v2i64:$vk)>;
-def : Pat<(loongarch_vilvh v4f32:$vj, v4f32:$vk),
-          (VILVH_W v4f32:$vj, v4f32:$vk)>;
-def : Pat<(loongarch_vilvh v2f64:$vj, v2f64:$vk),
-          (VILVH_D v2f64:$vj, v2f64:$vk)>;
-
-// VSHUF4I_{B/H/W}
-def : Pat<(loongarch_vshuf4i v16i8:$vj, immZExt8:$ui8),
-          (VSHUF4I_B v16i8:$vj, immZExt8:$ui8)>;
-def : Pat<(loongarch_vshuf4i v8i16:$vj, immZExt8:$ui8),
-        (VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>;
-def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8),
-        (VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
-def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8),
-        (VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
-
-// VREPLVEI_{B/H/W/D}
-def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4),
-          (VREPLVEI_B v16i8:$vj, immZExt4:$ui4)>;
-def : Pat<(loongarch_vreplvei v8i16:$vj, immZExt3:$ui3),
-        (VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>;
-def : Pat<(loongarch_vreplvei v4i32:$vj, immZExt2:$ui2),
-        (VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>;
-def : Pat<(loongarch_vreplvei v2i64:$vj, immZExt1:$ui1),
-        (VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>;
-def : Pat<(loongarch_vreplvei v4f32:$vj, immZExt2:$ui2),
-        (VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>;
-def : Pat<(loongarch_vreplvei v2f64:$vj, immZExt1:$ui1),
-        (VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>;
-
 // VREPLVEI_{W/D}
 def : Pat<(lsxsplatf32 FPR32:$fj),
           (VREPLVEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index 2c7b0bfeaaad5..710650acba304 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -26,6 +26,8 @@ TypeSize LoongArchTTIImpl::getRegisterBitWidth(
   case TargetTransformInfo::RGK_Scalar:
     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
   case TargetTransformInfo::RGK_FixedWidthVector:
+    if (!ST->hasExpAutoVec())
+      return DefSize;
     if (ST->hasExtLASX())
       return TypeSize::getFixed(256);
     if (ST->hasExtLSX())
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
index 2d781acbed988..a8a4f1ed327d3 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
@@ -37,7 +37,7 @@ MCELFStreamer &LoongArchTargetELFStreamer::getStreamer() {
 
 void LoongArchTargetELFStreamer::finish() {
   LoongArchTargetStreamer::finish();
-  ELFObjectWriter &W = getStreamer().getWriter();
+  MCAssembler &MCA = getStreamer().getAssembler();
   LoongArchABI::ABI ABI = getTargetABI();
 
   // Figure out the e_flags.
@@ -48,7 +48,7 @@ void LoongArchTargetELFStreamer::finish() {
   // based relocs from day one.
   //
   // Refer to LoongArch ELF psABI v2.01 for details.
-  unsigned EFlags = W.getELFHeaderEFlags();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
   EFlags |= ELF::EF_LOONGARCH_OBJABI_V1;
   switch (ABI) {
   case LoongArchABI::ABI_ILP32S:
@@ -66,7 +66,7 @@ void LoongArchTargetELFStreamer::finish() {
   case LoongArchABI::ABI_Unknown:
     llvm_unreachable("Improperly initialized target ABI");
   }
-  W.setELFHeaderEFlags(EFlags);
+  MCA.setELFHeaderEFlags(EFlags);
 }
 
 namespace {
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
index 6c7922a655d9f..0cdb3a595f719 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
@@ -35,6 +35,10 @@ class MSP430TargetELFStreamer : public MCTargetStreamer {
 MSP430TargetELFStreamer::MSP430TargetELFStreamer(MCStreamer &S,
                                                  const MCSubtargetInfo &STI)
     : MCTargetStreamer(S) {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
+  MCA.setELFHeaderEFlags(EFlags);
+
   // Emit build attributes section according to
   // MSP430 EABI (slaa534.pdf, part 13).
   MCSection *AttributeSection = getStreamer().getContext().getELFSection(
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index f8172e576ce4c..fc95b61fd4df5 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -17,7 +17,6 @@
 #include "MCTargetDesc/MipsMCExpr.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 1650ffc856191..3093b9fddc73a 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -801,7 +801,6 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
                                              const MCSubtargetInfo &STI)
     : MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) {
   MCAssembler &MCA = getStreamer().getAssembler();
-  ELFObjectWriter &W = getStreamer().getWriter();
 
   // It's possible that MCObjectFileInfo isn't fully initialized at this point
   // due to an initialization order problem where LLVMTargetMachine creates the
@@ -825,7 +824,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
   // We can fix this by making the target streamer construct
   // the ABI, but this is fraught with wide ranging dependency
   // issues as well.
-  unsigned EFlags = W.getELFHeaderEFlags();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
 
   // FIXME: Fix a dependency issue by instantiating the ABI object to some
   // default based off the triple. The triple doesn't describe the target
@@ -874,7 +873,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
   if (Features[Mips::FeatureNaN2008])
     EFlags |= ELF::EF_MIPS_NAN2008;
 
-  W.setELFHeaderEFlags(EFlags);
+  MCA.setELFHeaderEFlags(EFlags);
 }
 
 void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
@@ -890,7 +889,6 @@ void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
 
 void MipsTargetELFStreamer::finish() {
   MCAssembler &MCA = getStreamer().getAssembler();
-  ELFObjectWriter &W = getStreamer().getWriter();
   const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo();
   MCELFStreamer &S = getStreamer();
 
@@ -927,7 +925,7 @@ void MipsTargetELFStreamer::finish() {
 
   // Update e_header flags. See the FIXME and comment above in
   // the constructor for a full rundown on this.
-  unsigned EFlags = W.getELFHeaderEFlags();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
 
   // ABI
   // N64 does not require any ABI bits.
@@ -950,7 +948,7 @@ void MipsTargetELFStreamer::finish() {
   if (Pic)
     EFlags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
 
-  W.setELFHeaderEFlags(EFlags);
+  MCA.setELFHeaderEFlags(EFlags);
 
   // Emit all the option records.
   // At the moment we are only emitting .Mips.options (ODK_REGINFO) and
@@ -990,25 +988,25 @@ void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
 }
 
 void MipsTargetELFStreamer::setUsesMicroMips() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned Flags = W.getELFHeaderEFlags();
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_MICROMIPS;
-  W.setELFHeaderEFlags(Flags);
+  MCA.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveSetMips16() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned Flags = W.getELFHeaderEFlags();
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_ARCH_ASE_M16;
-  W.setELFHeaderEFlags(Flags);
+  MCA.setELFHeaderEFlags(Flags);
   forbidModuleDirective();
 }
 
 void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned Flags = W.getELFHeaderEFlags();
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_NOREORDER;
-  W.setELFHeaderEFlags(Flags);
+  MCA.setELFHeaderEFlags(Flags);
   forbidModuleDirective();
 }
 
@@ -1065,45 +1063,45 @@ void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
 }
 
 void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned Flags = W.getELFHeaderEFlags();
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_CPIC | ELF::EF_MIPS_PIC;
-  W.setELFHeaderEFlags(Flags);
+  MCA.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveNaN2008() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned Flags = W.getELFHeaderEFlags();
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_NAN2008;
-  W.setELFHeaderEFlags(Flags);
+  MCA.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveNaNLegacy() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned Flags = W.getELFHeaderEFlags();
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
   Flags &= ~ELF::EF_MIPS_NAN2008;
-  W.setELFHeaderEFlags(Flags);
+  MCA.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveOptionPic0() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned Flags = W.getELFHeaderEFlags();
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
   // This option overrides other PIC options like -KPIC.
   Pic = false;
   Flags &= ~ELF::EF_MIPS_PIC;
-  W.setELFHeaderEFlags(Flags);
+  MCA.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveOptionPic2() {
-  ELFObjectWriter &W = getStreamer().getWriter();
-  unsigned Flags = W.getELFHeaderEFlags();
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
   Pic = true;
   // NOTE: We are following the GAS behaviour here which means the directive
   // 'pic2' also sets the CPIC bit in the ELF header. This is different from
   // what is stated in the SYSV ABI which considers the bits EF_MIPS_PIC and
   // EF_MIPS_CPIC to be mutually exclusive.
   Flags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
-  W.setELFHeaderEFlags(Flags);
+  MCA.setELFHeaderEFlags(Flags);
 }
 
 void MipsTargetELFStreamer::emitDirectiveInsn() {
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 7bdca87d019fa..8d1f8624ee406 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -298,14 +298,15 @@ class PPCTargetELFStreamer : public PPCTargetStreamer {
   }
 
   void emitAbiVersion(int AbiVersion) override {
-    ELFObjectWriter &W = getStreamer().getWriter();
-    unsigned Flags = W.getELFHeaderEFlags();
+    MCAssembler &MCA = getStreamer().getAssembler();
+    unsigned Flags = MCA.getELFHeaderEFlags();
     Flags &= ~ELF::EF_PPC64_ABI;
     Flags |= (AbiVersion & ELF::EF_PPC64_ABI);
-    W.setELFHeaderEFlags(Flags);
+    MCA.setELFHeaderEFlags(Flags);
   }
 
   void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
+    MCAssembler &MCA = getStreamer().getAssembler();
 
     // encodePPC64LocalEntryOffset will report an error if it cannot
     // encode LocalOffset.
@@ -318,10 +319,9 @@ class PPCTargetELFStreamer : public PPCTargetStreamer {
 
     // For GAS compatibility, unless we already saw a .abiversion directive,
     // set e_flags to indicate ELFv2 ABI.
-    ELFObjectWriter &W = getStreamer().getWriter();
-    unsigned Flags = W.getELFHeaderEFlags();
+    unsigned Flags = MCA.getELFHeaderEFlags();
     if ((Flags & ELF::EF_PPC64_ABI) == 0)
-      W.setELFHeaderEFlags(Flags | 2);
+      MCA.setELFHeaderEFlags(Flags | 2);
   }
 
   void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 0e2811f87c817..d74143b487880 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -909,23 +909,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   // Lower multi-instruction pseudo operations.
   switch (MI->getOpcode()) {
   default: break;
-  case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
-    assert(!Subtarget->isAIXABI() &&
-           "AIX does not support patchable function entry!");
-    // PATCHABLE_FUNCTION_ENTER on little endian is for XRAY support which is
-    // handled in PPCLinuxAsmPrinter.
-    if (MAI->isLittleEndian())
-      return;
-    const Function &F = MF->getFunction();
-    unsigned Num = 0;
-    (void)F.getFnAttribute("patchable-function-entry")
-        .getValueAsString()
-        .getAsInteger(10, Num);
-    if (!Num)
-      return;
-    emitNops(Num);
-    return;
-  }
   case TargetOpcode::DBG_VALUE:
     llvm_unreachable("Should be handled target independently");
   case TargetOpcode::STACKMAP:
@@ -1797,7 +1780,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
   switch (MI->getOpcode()) {
   default:
-    break;
+    return PPCAsmPrinter::emitInstruction(MI);
   case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
     // .begin:
     //   b .end # lis 0, FuncId[16..32]
@@ -1810,9 +1793,6 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
     //
     // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
     // of instructions change.
-    // XRAY is only supported on PPC Linux little endian.
-    if (!MAI->isLittleEndian())
-      break;
     MCSymbol *BeginOfSled = OutContext.createTempSymbol();
     MCSymbol *EndOfSled = OutContext.createTempSymbol();
     OutStreamer->emitLabel(BeginOfSled);
@@ -1854,7 +1834,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
       IsConditional = false;
     } else {
       EmitToStreamer(*OutStreamer, RetInst);
-      return;
+      break;
     }
 
     MCSymbol *FallthroughLabel;
@@ -1919,7 +1899,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
     if (IsConditional)
       OutStreamer->emitLabel(FallthroughLabel);
     recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT, 2);
-    return;
+    break;
   }
   case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
     llvm_unreachable("PATCHABLE_FUNCTION_EXIT should never be emitted");
@@ -1929,7 +1909,6 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
     llvm_unreachable("Tail call is handled in the normal case. See comments "
                      "around this assert.");
   }
-  return PPCAsmPrinter::emitInstruction(MI);
 }
 
 void PPCLinuxAsmPrinter::emitStartOfAsmFile(Module &M) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 237a8eaca05a0..cb64b64c503e9 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -560,7 +560,11 @@ bool RISCVAsmBackend::evaluateTargetFixup(const MCAssembler &Asm,
   if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined())
     return false;
 
-  bool IsResolved = Asm.getWriter().isSymbolRefDifferenceFullyResolvedImpl(
+  auto *Writer = Asm.getWriterPtr();
+  if (!Writer)
+    return false;
+
+  bool IsResolved = Writer->isSymbolRefDifferenceFullyResolvedImpl(
       Asm, SA, *AUIPCDF, false, true);
   if (!IsResolved)
     return false;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 2c2aa8af8955c..87c5a756e0258 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -87,10 +87,10 @@ void RISCVTargetELFStreamer::finishAttributeSection() {
 
 void RISCVTargetELFStreamer::finish() {
   RISCVTargetStreamer::finish();
-  ELFObjectWriter &W = getStreamer().getWriter();
+  MCAssembler &MCA = getStreamer().getAssembler();
   RISCVABI::ABI ABI = getTargetABI();
 
-  unsigned EFlags = W.getELFHeaderEFlags();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
 
   if (hasRVC())
     EFlags |= ELF::EF_RISCV_RVC;
@@ -117,7 +117,7 @@ void RISCVTargetELFStreamer::finish() {
     llvm_unreachable("Improperly initialised target ABI");
   }
 
-  W.setELFHeaderEFlags(EFlags);
+  MCA.setELFHeaderEFlags(EFlags);
 }
 
 void RISCVTargetELFStreamer::reset() {
@@ -132,6 +132,7 @@ void RISCVTargetELFStreamer::emitDirectiveVariantCC(MCSymbol &Symbol) {
 void RISCVELFStreamer::reset() {
   static_cast<RISCVTargetStreamer *>(getTargetStreamer())->reset();
   MCELFStreamer::reset();
+  MappingSymbolCounter = 0;
   LastMappingSymbols.clear();
   LastEMS = EMS_None;
 }
@@ -151,7 +152,8 @@ void RISCVELFStreamer::emitInstructionsMappingSymbol() {
 }
 
 void RISCVELFStreamer::emitMappingSymbol(StringRef Name) {
-  auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name));
+  auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+      Name + "." + Twine(MappingSymbolCounter++)));
   emitLabel(Symbol);
   Symbol->setType(ELF::STT_NOTYPE);
   Symbol->setBinding(ELF::STB_LOCAL);
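
With the counter restored, each mode transition gets a uniquely named mapping symbol instead of reusing one local symbol per name. A sketch of the resulting names, assuming the usual "$x"/"$d" prefixes:

    #include <cstdint>
    #include <string>

    // Successive calls produce "$x.0", "$d.1", "$x.2", ...
    std::string nextMappingSymbolName(const std::string &Prefix,
                                      int64_t &Counter) {
      return Prefix + "." + std::to_string(Counter++);
    }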
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index b06760859483d..40c6b5ac3dcc8 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -22,6 +22,7 @@ class RISCVELFStreamer : public MCELFStreamer {
 
   enum ElfMappingSymbol { EMS_None, EMS_Instructions, EMS_Data };
 
+  int64_t MappingSymbolCounter = 0;
   DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
   ElfMappingSymbol LastEMS = EMS_None;
 
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3c868dbbf8b3a..c9979b2b36fc3 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -37,15 +37,6 @@ class RISCVExtension<string name, int major, int minor, string desc,
   bit Experimental = false;
 }
 
-// The groupID/bitmask of RISCVExtension is used to retrieve a specific bit value 
-// from __riscv_feature_bits based on the groupID and bitmask.
-// groupID - groupID of extension
-// bitPos  - bit position of extension bitmask
-class RISCVExtensionBitmask<bits<3> groupID, int bitPos> {
-    int GroupID = groupID;
-    int BitPos = bitPos;
-}
-
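
The removed class recorded where each extension's runtime-detection bit lives. A sketch of how a (GroupID, BitPos) pair would be consumed, assuming one 64-bit word per group; the struct name and layout are illustrative, not the real __riscv_feature_bits definition:

    #include <cstdint>

    struct FeatureBits {
      uint64_t Groups[2]; // illustrative layout: one word per group
    };

    // E.g. the entries removed below: 'I' was (0, 8), Zicboz (0, 37).
    bool hasExtension(const FeatureBits &FB, unsigned GroupID,
                      unsigned BitPos) {
      return (FB.Groups[GroupID] >> BitPos) & 1;
    }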
 // Version of RISCVExtension to be used for Experimental extensions. This
 // sets the Experimental flag and prepends experimental- to the -mattr name.
 class RISCVExperimentalExtension<string name, int major, int minor, string desc,
@@ -61,8 +52,7 @@ class RISCVExperimentalExtension<string name, int major, int minor, string desc,
 
 def FeatureStdExtI
     : RISCVExtension<"i", 2, 1,
-                     "'I' (Base Integer Instruction Set)">,
-      RISCVExtensionBitmask<0, 8>;
+                     "'I' (Base Integer Instruction Set)">;
 
 def FeatureStdExtE
     : RISCVExtension<"e", 2, 0,
@@ -88,8 +78,7 @@ def HasStdExtZicbop : Predicate<"Subtarget->hasStdExtZicbop()">,
 
 def FeatureStdExtZicboz
     : RISCVExtension<"zicboz", 1, 0,
-                     "'Zicboz' (Cache-Block Zero Instructions)">,
-      RISCVExtensionBitmask<0, 37>;
+                     "'Zicboz' (Cache-Block Zero Instructions)">;
 def HasStdExtZicboz : Predicate<"Subtarget->hasStdExtZicboz()">,
                       AssemblerPredicate<(all_of FeatureStdExtZicboz),
                           "'Zicboz' (Cache-Block Zero Instructions)">;
@@ -124,8 +113,7 @@ def FeatureStdExtZicntr
 
 def FeatureStdExtZicond
     : RISCVExtension<"zicond", 1, 0,
-                     "'Zicond' (Integer Conditional Operations)">,
-      RISCVExtensionBitmask<0, 38>;
+                     "'Zicond' (Integer Conditional Operations)">;
 def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">,
                       AssemblerPredicate<(all_of FeatureStdExtZicond),
                           "'Zicond' (Integer Conditional Operations)">;
@@ -139,16 +127,14 @@ def HasStdExtZifencei : Predicate<"Subtarget->hasStdExtZifencei()">,
 
 def FeatureStdExtZihintpause
     : RISCVExtension<"zihintpause", 2, 0,
-                     "'Zihintpause' (Pause Hint)">,
-      RISCVExtensionBitmask<0, 40>;
+                     "'Zihintpause' (Pause Hint)">;
 def HasStdExtZihintpause : Predicate<"Subtarget->hasStdExtZihintpause()">,
                            AssemblerPredicate<(all_of FeatureStdExtZihintpause),
                                               "'Zihintpause' (Pause Hint)">;
 
 def FeatureStdExtZihintntl
     : RISCVExtension<"zihintntl", 1, 0,
-                     "'Zihintntl' (Non-Temporal Locality Hints)">,
-      RISCVExtensionBitmask<0, 39>;
+                     "'Zihintntl' (Non-Temporal Locality Hints)">;
 def HasStdExtZihintntl : Predicate<"Subtarget->hasStdExtZihintntl()">,
                          AssemblerPredicate<(all_of FeatureStdExtZihintntl),
                              "'Zihintntl' (Non-Temporal Locality Hints)">;
@@ -195,8 +181,7 @@ def HasStdExtZmmul : Predicate<"Subtarget->hasStdExtZmmul()">,
 def FeatureStdExtM
     : RISCVExtension<"m", 2, 0,
                      "'M' (Integer Multiplication and Division)",
-                     [FeatureStdExtZmmul]>,
-      RISCVExtensionBitmask<0, 12>;
+                     [FeatureStdExtZmmul]>;
 def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">,
                  AssemblerPredicate<(all_of FeatureStdExtM),
                      "'M' (Integer Multiplication and Division)">;
@@ -205,16 +190,14 @@ def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">,
 
 def FeatureStdExtA
     : RISCVExtension<"a", 2, 1,
-                     "'A' (Atomic Instructions)">,
-      RISCVExtensionBitmask<0, 0>;
+                     "'A' (Atomic Instructions)">;
 def HasStdExtA : Predicate<"Subtarget->hasStdExtA()">,
                  AssemblerPredicate<(all_of FeatureStdExtA),
                                     "'A' (Atomic Instructions)">;
 
 def FeatureStdExtZtso
     : RISCVExtension<"ztso", 1, 0,
-                     "'Ztso' (Memory Model - Total Store Order)">,
-      RISCVExtensionBitmask<0, 47>;
+                     "'Ztso' (Memory Model - Total Store Order)">;
 def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZtso()">,
                     AssemblerPredicate<(all_of FeatureStdExtZtso),
                         "'Ztso' (Memory Model - Total Store Order)">;
@@ -243,9 +226,8 @@ def HasStdExtZabha : Predicate<"Subtarget->hasStdExtZabha()">,
                          "'Zabha' (Byte and Halfword Atomic Memory Operations)">;
 
 def FeatureStdExtZacas
-    : RISCVExperimentalExtension<"zacas", 1, 0,
-                                 "'Zacas' (Atomic Compare-And-Swap Instructions)">,
-      RISCVExtensionBitmask<0, 26>;
+    : RISCVExtension<"zacas", 1, 0,
+                     "'Zacas' (Atomic Compare-And-Swap Instructions)">;
 def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">,
                      AssemblerPredicate<(all_of FeatureStdExtZacas),
                          "'Zacas' (Atomic Compare-And-Swap Instructions)">;
@@ -282,8 +264,7 @@ def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">,
 def FeatureStdExtF
     : RISCVExtension<"f", 2, 2,
                      "'F' (Single-Precision Floating-Point)",
-                     [FeatureStdExtZicsr]>,
-      RISCVExtensionBitmask<0, 5>;
+                     [FeatureStdExtZicsr]>;
 def HasStdExtF : Predicate<"Subtarget->hasStdExtF()">,
                  AssemblerPredicate<(all_of FeatureStdExtF),
                                     "'F' (Single-Precision Floating-Point)">;
@@ -291,8 +272,7 @@ def HasStdExtF : Predicate<"Subtarget->hasStdExtF()">,
 def FeatureStdExtD
     : RISCVExtension<"d", 2, 2,
                      "'D' (Double-Precision Floating-Point)",
-                     [FeatureStdExtF]>,
-      RISCVExtensionBitmask<0, 3>;
+                     [FeatureStdExtF]>;
 def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">,
                  AssemblerPredicate<(all_of FeatureStdExtD),
                                     "'D' (Double-Precision Floating-Point)">;
@@ -300,8 +280,7 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">,
 def FeatureStdExtZfhmin
     : RISCVExtension<"zfhmin", 1, 0,
                      "'Zfhmin' (Half-Precision Floating-Point Minimal)",
-                     [FeatureStdExtF]>,
-      RISCVExtensionBitmask<0, 36>;
+                     [FeatureStdExtF]>;
 def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">,
                       AssemblerPredicate<(all_of FeatureStdExtZfhmin),
                           "'Zfh' (Half-Precision Floating-Point) or "
@@ -310,8 +289,7 @@ def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">,
 def FeatureStdExtZfh
     : RISCVExtension<"zfh", 1, 0,
                      "'Zfh' (Half-Precision Floating-Point)",
-                     [FeatureStdExtZfhmin]>,
-      RISCVExtensionBitmask<0, 35>;
+                     [FeatureStdExtZfhmin]>;
 def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">,
                    AssemblerPredicate<(all_of FeatureStdExtZfh),
                        "'Zfh' (Half-Precision Floating-Point)">;
@@ -335,8 +313,7 @@ def HasHalfFPLoadStoreMove
 def FeatureStdExtZfa
     : RISCVExtension<"zfa", 1, 0,
                      "'Zfa' (Additional Floating-Point)",
-                     [FeatureStdExtF]>,
-      RISCVExtensionBitmask<0, 34>;
+                     [FeatureStdExtF]>;
 def HasStdExtZfa : Predicate<"Subtarget->hasStdExtZfa()">,
                    AssemblerPredicate<(all_of FeatureStdExtZfa),
                                       "'Zfa' (Additional Floating-Point)">;
@@ -379,8 +356,7 @@ def NoStdExtZhinx : Predicate<"!Subtarget->hasStdExtZhinx()">;
 
 def FeatureStdExtC
     : RISCVExtension<"c", 2, 0,
-                     "'C' (Compressed Instructions)">,
-      RISCVExtensionBitmask<0, 2>;
+                     "'C' (Compressed Instructions)">;
 def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">,
                  AssemblerPredicate<(all_of FeatureStdExtC),
                                     "'C' (Compressed Instructions)">;
@@ -469,8 +445,7 @@ def HasStdExtZcmop : Predicate<"Subtarget->hasStdExtZcmop()">,
 
 def FeatureStdExtZba
     : RISCVExtension<"zba", 1, 0,
-                     "'Zba' (Address Generation Instructions)">,
-      RISCVExtensionBitmask<0, 27>;
+                     "'Zba' (Address Generation Instructions)">;
 def HasStdExtZba : Predicate<"Subtarget->hasStdExtZba()">,
                    AssemblerPredicate<(all_of FeatureStdExtZba),
                                       "'Zba' (Address Generation Instructions)">;
@@ -478,8 +453,7 @@ def NotHasStdExtZba : Predicate<"!Subtarget->hasStdExtZba()">;
 
 def FeatureStdExtZbb
     : RISCVExtension<"zbb", 1, 0,
-                     "'Zbb' (Basic Bit-Manipulation)">,
-      RISCVExtensionBitmask<0, 28>;
+                     "'Zbb' (Basic Bit-Manipulation)">;
 def HasStdExtZbb : Predicate<"Subtarget->hasStdExtZbb()">,
                    AssemblerPredicate<(all_of FeatureStdExtZbb),
                                       "'Zbb' (Basic Bit-Manipulation)">;
@@ -488,16 +462,14 @@ def NoStdExtZbb : Predicate<"!Subtarget->hasStdExtZbb()">,
 
 def FeatureStdExtZbc
     : RISCVExtension<"zbc", 1, 0,
-                     "'Zbc' (Carry-Less Multiplication)">,
-      RISCVExtensionBitmask<0, 29>;
+                     "'Zbc' (Carry-Less Multiplication)">;
 def HasStdExtZbc : Predicate<"Subtarget->hasStdExtZbc()">,
                    AssemblerPredicate<(all_of FeatureStdExtZbc),
                                       "'Zbc' (Carry-Less Multiplication)">;
 
 def FeatureStdExtZbs
     : RISCVExtension<"zbs", 1, 0,
-                     "'Zbs' (Single-Bit Instructions)">,
-      RISCVExtensionBitmask<0, 33>;
+                     "'Zbs' (Single-Bit Instructions)">;
 def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">,
                    AssemblerPredicate<(all_of FeatureStdExtZbs),
                                       "'Zbs' (Single-Bit Instructions)">;
@@ -514,16 +486,14 @@ def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">,
 
 def FeatureStdExtZbkb
     : RISCVExtension<"zbkb", 1, 0,
-                     "'Zbkb' (Bitmanip instructions for Cryptography)">,
-      RISCVExtensionBitmask<0, 30>;
+                     "'Zbkb' (Bitmanip instructions for Cryptography)">;
 def HasStdExtZbkb : Predicate<"Subtarget->hasStdExtZbkb()">,
                     AssemblerPredicate<(all_of FeatureStdExtZbkb),
                         "'Zbkb' (Bitmanip instructions for Cryptography)">;
 
 def FeatureStdExtZbkx
     : RISCVExtension<"zbkx", 1, 0,
-                     "'Zbkx' (Crossbar permutation instructions)">,
-      RISCVExtensionBitmask<0, 32>;
+                     "'Zbkx' (Crossbar permutation instructions)">;
 def HasStdExtZbkx : Predicate<"Subtarget->hasStdExtZbkx()">,
                     AssemblerPredicate<(all_of FeatureStdExtZbkx),
                         "'Zbkx' (Crossbar permutation instructions)">;
@@ -540,8 +510,7 @@ def HasStdExtZbbOrZbkb
 def FeatureStdExtZbkc
     : RISCVExtension<"zbkc", 1, 0,
                      "'Zbkc' (Carry-less multiply instructions for "
-                     "Cryptography)">,
-      RISCVExtensionBitmask<0, 31>;
+                     "Cryptography)">;
 def HasStdExtZbkc
     : Predicate<"Subtarget->hasStdExtZbkc()">,
       AssemblerPredicate<(all_of FeatureStdExtZbkc),
@@ -558,16 +527,14 @@ def HasStdExtZbcOrZbkc
 
 def FeatureStdExtZknd
     : RISCVExtension<"zknd", 1, 0,
-                     "'Zknd' (NIST Suite: AES Decryption)">,
-      RISCVExtensionBitmask<0, 41>;
+                     "'Zknd' (NIST Suite: AES Decryption)">;
 def HasStdExtZknd : Predicate<"Subtarget->hasStdExtZknd()">,
                     AssemblerPredicate<(all_of FeatureStdExtZknd),
                                        "'Zknd' (NIST Suite: AES Decryption)">;
 
 def FeatureStdExtZkne
     : RISCVExtension<"zkne", 1, 0,
-                     "'Zkne' (NIST Suite: AES Encryption)">,
-      RISCVExtensionBitmask<0, 42>;
+                     "'Zkne' (NIST Suite: AES Encryption)">;
 def HasStdExtZkne : Predicate<"Subtarget->hasStdExtZkne()">,
                     AssemblerPredicate<(all_of FeatureStdExtZkne),
                                        "'Zkne' (NIST Suite: AES Encryption)">;
@@ -582,24 +549,21 @@ def HasStdExtZkndOrZkne
 
 def FeatureStdExtZknh
     : RISCVExtension<"zknh", 1, 0,
-                     "'Zknh' (NIST Suite: Hash Function Instructions)">,
-      RISCVExtensionBitmask<0, 43>;
+                     "'Zknh' (NIST Suite: Hash Function Instructions)">;
 def HasStdExtZknh : Predicate<"Subtarget->hasStdExtZknh()">,
                     AssemblerPredicate<(all_of FeatureStdExtZknh),
                         "'Zknh' (NIST Suite: Hash Function Instructions)">;
 
 def FeatureStdExtZksed
     : RISCVExtension<"zksed", 1, 0,
-                     "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">,
-      RISCVExtensionBitmask<0, 44>;
+                     "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">;
 def HasStdExtZksed : Predicate<"Subtarget->hasStdExtZksed()">,
                      AssemblerPredicate<(all_of FeatureStdExtZksed),
                          "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">;
 
 def FeatureStdExtZksh
     : RISCVExtension<"zksh", 1, 0,
-                     "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">,
-      RISCVExtensionBitmask<0, 45>;
+                     "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">;
 def HasStdExtZksh : Predicate<"Subtarget->hasStdExtZksh()">,
                     AssemblerPredicate<(all_of FeatureStdExtZksh),
                         "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">;
@@ -632,8 +596,7 @@ def FeatureStdExtZks
 
 def FeatureStdExtZkt
     : RISCVExtension<"zkt", 1, 0,
-                     "'Zkt' (Data Independent Execution Latency)">,
-      RISCVExtensionBitmask<0, 46>;
+                     "'Zkt' (Data Independent Execution Latency)">;
 
 def FeatureStdExtZk
     : RISCVExtension<"zk", 1, 0,
@@ -663,7 +626,6 @@ def FeatureStdExtZve32x
                      "with maximal 32 EEW)",
                      [FeatureStdExtZicsr, FeatureStdExtZvl32b]>;
 
-
 def FeatureStdExtZve32f
     : RISCVExtension<"zve32f", 1, 0,
                      "'Zve32f' (Vector Extensions for Embedded Processors "
@@ -691,8 +653,7 @@ def FeatureStdExtZve64d
 def FeatureStdExtV
     : RISCVExtension<"v", 1, 0,
                      "'V' (Vector Extension for Application Processors)",
-                     [FeatureStdExtZvl128b, FeatureStdExtZve64d]>,
-      RISCVExtensionBitmask<0, 21>;
+                     [FeatureStdExtZvl128b, FeatureStdExtZve64d]>;
 
 def FeatureStdExtZvfbfmin
     : RISCVExtension<"zvfbfmin", 1, 0, "'Zvbfmin' (Vector BF16 Converts)",
@@ -712,14 +673,12 @@ def HasStdExtZvfbfwma : Predicate<"Subtarget->hasStdExtZvfbfwma()">,
 def FeatureStdExtZvfhmin
     : RISCVExtension<"zvfhmin", 1, 0,
                      "'Zvfhmin' (Vector Half-Precision Floating-Point Minimal)",
-                     [FeatureStdExtZve32f]>,
-      RISCVExtensionBitmask<0, 51>;
+                     [FeatureStdExtZve32f]>;
 
 def FeatureStdExtZvfh
     : RISCVExtension<"zvfh", 1, 0,
                      "'Zvfh' (Vector Half-Precision Floating-Point)",
-                     [FeatureStdExtZvfhmin, FeatureStdExtZfhmin]>,
-      RISCVExtensionBitmask<0, 50>;
+                     [FeatureStdExtZvfhmin, FeatureStdExtZfhmin]>;
 
 def HasStdExtZfhOrZvfh
     : Predicate<"Subtarget->hasStdExtZfh() || Subtarget->hasStdExtZvfh()">,
@@ -731,8 +690,7 @@ def HasStdExtZfhOrZvfh
 
 def FeatureStdExtZvkb
     : RISCVExtension<"zvkb", 1, 0,
-                     "'Zvkb' (Vector Bit-manipulation used in Cryptography)">,
-      RISCVExtensionBitmask<0, 52>;
+                     "'Zvkb' (Vector Bit-manipulation used in Cryptography)">;
 def HasStdExtZvkb : Predicate<"Subtarget->hasStdExtZvkb()">,
                     AssemblerPredicate<(all_of FeatureStdExtZvkb),
                         "'Zvkb' (Vector Bit-manipulation used in Cryptography)">;
@@ -740,40 +698,35 @@ def HasStdExtZvkb : Predicate<"Subtarget->hasStdExtZvkb()">,
 def FeatureStdExtZvbb
     : RISCVExtension<"zvbb", 1, 0,
                      "'Zvbb' (Vector basic bit-manipulation instructions)",
-                     [FeatureStdExtZvkb]>,
-      RISCVExtensionBitmask<0, 48>;
+                     [FeatureStdExtZvkb]>;
 def HasStdExtZvbb : Predicate<"Subtarget->hasStdExtZvbb()">,
                     AssemblerPredicate<(all_of FeatureStdExtZvbb),
                         "'Zvbb' (Vector basic bit-manipulation instructions)">;
 
 def FeatureStdExtZvbc
     : RISCVExtension<"zvbc", 1, 0,
-                     "'Zvbc' (Vector Carryless Multiplication)">,
-      RISCVExtensionBitmask<0, 49>;
+                     "'Zvbc' (Vector Carryless Multiplication)">;
 def HasStdExtZvbc : Predicate<"Subtarget->hasStdExtZvbc()">,
                     AssemblerPredicate<(all_of FeatureStdExtZvbc),
                         "'Zvbc' (Vector Carryless Multiplication)">;
 
 def FeatureStdExtZvkg
     : RISCVExtension<"zvkg", 1, 0,
-                     "'Zvkg' (Vector GCM instructions for Cryptography)">,
-      RISCVExtensionBitmask<0, 53>;
+                     "'Zvkg' (Vector GCM instructions for Cryptography)">;
 def HasStdExtZvkg : Predicate<"Subtarget->hasStdExtZvkg()">,
                     AssemblerPredicate<(all_of FeatureStdExtZvkg),
                         "'Zvkg' (Vector GCM instructions for Cryptography)">;
 
 def FeatureStdExtZvkned
     : RISCVExtension<"zvkned", 1, 0,
-                     "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">,
-      RISCVExtensionBitmask<0, 54>;
+                     "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">;
 def HasStdExtZvkned : Predicate<"Subtarget->hasStdExtZvkned()">,
                       AssemblerPredicate<(all_of FeatureStdExtZvkned),
                           "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">;
 
 def FeatureStdExtZvknha
     : RISCVExtension<"zvknha", 1, 0,
-                     "'Zvknha' (Vector SHA-2 (SHA-256 only))">,
-      RISCVExtensionBitmask<0, 55>;
+                     "'Zvknha' (Vector SHA-2 (SHA-256 only))">;
 def HasStdExtZvknha : Predicate<"Subtarget->hasStdExtZvknha()">,
                       AssemblerPredicate<(all_of FeatureStdExtZvknha),
                           "'Zvknha' (Vector SHA-2 (SHA-256 only))">;
@@ -781,8 +734,7 @@ def HasStdExtZvknha : Predicate<"Subtarget->hasStdExtZvknha()">,
 def FeatureStdExtZvknhb
     : RISCVExtension<"zvknhb", 1, 0,
                      "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))",
-                     [FeatureStdExtZve64x]>,
-      RISCVExtensionBitmask<0, 56>;
+                     [FeatureStdExtZve64x]>;
 def HasStdExtZvknhb : Predicate<"Subtarget->hasStdExtZvknhb()">,
                       AssemblerPredicate<(all_of FeatureStdExtZvknhb),
                           "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))">;
@@ -793,24 +745,21 @@ def HasStdExtZvknhaOrZvknhb : Predicate<"Subtarget->hasStdExtZvknha() || Subtarg
 
 def FeatureStdExtZvksed
     : RISCVExtension<"zvksed", 1, 0,
-                     "'Zvksed' (SM4 Block Cipher Instructions)">,
-      RISCVExtensionBitmask<0, 57>;
+                     "'Zvksed' (SM4 Block Cipher Instructions)">;
 def HasStdExtZvksed : Predicate<"Subtarget->hasStdExtZvksed()">,
                       AssemblerPredicate<(all_of FeatureStdExtZvksed),
                           "'Zvksed' (SM4 Block Cipher Instructions)">;
 
 def FeatureStdExtZvksh
     : RISCVExtension<"zvksh", 1, 0,
-                     "'Zvksh' (SM3 Hash Function Instructions)">,
-      RISCVExtensionBitmask<0, 58>;
+                     "'Zvksh' (SM3 Hash Function Instructions)">;
 def HasStdExtZvksh : Predicate<"Subtarget->hasStdExtZvksh()">,
                      AssemblerPredicate<(all_of FeatureStdExtZvksh),
                          "'Zvksh' (SM3 Hash Function Instructions)">;
 
 def FeatureStdExtZvkt
     : RISCVExtension<"zvkt", 1, 0,
-                     "'Zvkt' (Vector Data-Independent Execution Latency)">,
-      RISCVExtensionBitmask<0, 59>;
+                     "'Zvkt' (Vector Data-Independent Execution Latency)">;
 
 // Zvk short-hand extensions
 
@@ -847,6 +796,7 @@ def FeatureStdExtZvksg
     : RISCVExtension<"zvksg", 1, 0,
                      "'Zvksg' (shorthand for 'Zvks' and 'Zvkg')",
                      [FeatureStdExtZvks, FeatureStdExtZvkg]>;
+
 // Vector instruction predicates
 
 def HasVInstructions    : Predicate<"Subtarget->hasVInstructions()">,
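Aside for reviewers: the RISCVExtensionBitmask removals above back out the
mapping of each extension to a (GroupID, BitPos) slot in
__riscv_feature_bits. A hedged C++ sketch of the lookup idea the class
supported, with the array layout assumed rather than taken from the patch:

    #include <cstdint>

    // Test one extension bit given a per-group feature-bit array; the
    // reverted entries placed e.g. Zba at group 0, bit 27.
    static bool hasExtension(const uint64_t *GroupBits, unsigned GroupID,
                             unsigned BitPos) {
      return (GroupBits[GroupID] >> BitPos) & 1;
    }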
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 823fb428472ef..e938454b8e642 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18418,15 +18418,6 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   // Select_FPRX_ (rs1, rs2, imm, rs4, (Select_FPRX_ rs1, rs2, imm, rs4, rs5))
   // is checked here and handled by a separate function -
   // EmitLoweredCascadedSelect.
-
-  auto Next = next_nodbg(MI.getIterator(), BB->instr_end());
-  if ((MI.getOpcode() != RISCV::Select_GPR_Using_CC_GPR &&
-       MI.getOpcode() != RISCV::Select_GPR_Using_CC_Imm) &&
-      Next != BB->end() && Next->getOpcode() == MI.getOpcode() &&
-      Next->getOperand(5).getReg() == MI.getOperand(0).getReg() &&
-      Next->getOperand(5).isKill())
-    return EmitLoweredCascadedSelect(MI, *Next, BB, Subtarget);
-
   Register LHS = MI.getOperand(1).getReg();
   Register RHS;
   if (MI.getOperand(2).isReg())
@@ -18438,6 +18429,15 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   SelectDests.insert(MI.getOperand(0).getReg());
 
   MachineInstr *LastSelectPseudo = &MI;
+  auto Next = next_nodbg(MI.getIterator(), BB->instr_end());
+  if ((MI.getOpcode() != RISCV::Select_GPR_Using_CC_GPR &&
+       MI.getOpcode() != RISCV::Select_GPR_Using_CC_Imm) &&
+      Next != BB->end() && Next->getOpcode() == MI.getOpcode() &&
+      Next->getOperand(5).getReg() == MI.getOperand(0).getReg() &&
+      Next->getOperand(5).isKill()) {
+    return EmitLoweredCascadedSelect(MI, *Next, BB, Subtarget);
+  }
+
   for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
        SequenceMBBI != E; ++SequenceMBBI) {
     if (SequenceMBBI->isDebugInstr())
@@ -18478,11 +18478,6 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
   F->insert(I, IfFalseMBB);
   F->insert(I, TailMBB);
 
-  // Set the call frame size on entry to the new basic blocks.
-  unsigned CallFrameSize = TII.getCallFrameSizeAt(*LastSelectPseudo);
-  IfFalseMBB->setCallFrameSize(CallFrameSize);
-  TailMBB->setCallFrameSize(CallFrameSize);
-
   // Transfer debug instructions associated with the selects to TailMBB.
   for (MachineInstr *DebugInstr : SelectDebugValues) {
     TailMBB->push_back(DebugInstr->removeFromParent());
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 21fbf47875e68..bc3699db6f91e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -279,10 +279,8 @@ class RVVRegisterRegAlloc : public RegisterRegAllocBase<RVVRegisterRegAlloc> {
 };
 
 static bool onlyAllocateRVVReg(const TargetRegisterInfo &TRI,
-                               const MachineRegisterInfo &MRI,
-                               const Register Reg) {
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-  return RISCVRegisterInfo::isRVVRegClass(RC);
+                               const TargetRegisterClass &RC) {
+  return RISCVRegisterInfo::isRVVRegClass(&RC);
 }
 
 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5a92d6bab31a9..f9eef60f77b7a 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1688,6 +1688,7 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                          Args, CxtI);
 
+
   auto getConstantMatCost =
     [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
     if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
@@ -1759,14 +1760,8 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
                                                            Op1Info, Op2Info,
                                                            Args, CxtI);
   }
-
-  InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
-  // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
-  // ops are twice as expensive as integer ops. Do the same for vectors so
-  // scalar floating point ops aren't cheaper than their vector equivalents.
-  if (Ty->isFPOrFPVectorTy())
-    InstrCost *= 2;
-  return ConstantMatCost + LT.first * InstrCost;
+  return ConstantMatCost +
+         LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
 }
 
 // TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
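A worked example of what the reverted multiplier did, with hypothetical
numbers: for an FP vector type with LT.first = 2 and a per-instruction cost
of 1, the deleted code returned ConstantMatCost + 2 * (1 * 2) =
ConstantMatCost + 4, while the restored single expression returns
ConstantMatCost + 2 * 1 = ConstantMatCost + 2.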
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 7fe8e11aaa420..3206c264f99d3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -32,7 +32,6 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCSPIRVObjectWriter.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -117,8 +116,8 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) {
   // number and number of generated OpLabels
   unsigned Bound = 2 * (ST->getBound() + 1) + NLabels;
   if (MCAssembler *Asm = OutStreamer->getAssemblerPtr())
-    static_cast<SPIRVObjectWriter &>(Asm->getWriter())
-        .setBuildVersion(Major, Minor, Bound);
+    Asm->setBuildVersion(static_cast<MachO::PlatformType>(0), Major, Minor,
+                         Bound, VersionTuple(Major, Minor, 0, Bound));
 }
 
 void SPIRVAsmPrinter::emitFunctionHeader() {
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index a3ef11b2cab45..09a6d57afc0ba 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -201,7 +201,7 @@ class X86AsmBackend : public MCAsmBackend {
   bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
                               unsigned &RemainingSize) const;
 
-  bool finishLayout(const MCAssembler &Asm) const override;
+  void finishLayout(const MCAssembler &Asm) const override;
 
   unsigned getMaximumNopSize(const MCSubtargetInfo &STI) const override;
 
@@ -856,7 +856,7 @@ bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
   return Changed;
 }
 
-bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
+void X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
   // See if we can further relax some instructions to cut down on the number of
   // nop bytes required for code alignment.  The actual win is in reducing
   // instruction count, not number of bytes.  Modern X86-64 can easily end up
@@ -864,7 +864,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
   // (i.e. eliminate nops) even at the cost of increasing the size and
   // complexity of others.
   if (!X86PadForAlign && !X86PadForBranchAlign)
-    return false;
+    return;
 
   // The processed regions are delimited by LabeledFragments. -g may have more
   // MCSymbols and therefore different relaxation results. X86PadForAlign is
@@ -911,6 +911,9 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
         continue;
       }
 
+#ifndef NDEBUG
+      const uint64_t OrigOffset = Asm.getFragmentOffset(F);
+#endif
       const uint64_t OrigSize = Asm.computeFragmentSize(F);
 
       // To keep the effects local, prefer to relax instructions closest to
@@ -923,7 +926,8 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
         // Give the backend a chance to play any tricks it wishes to increase
         // the encoding size of the given instruction.  Target independent code
         // will try further relaxation, but target's may play further tricks.
-        padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize);
+        if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize))
+          Sec.setHasLayout(false);
 
         // If we have an instruction which hasn't been fully relaxed, we can't
         // skip past it and insert bytes before it.  Changing its starting
@@ -940,6 +944,14 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
       if (F.getKind() == MCFragment::FT_BoundaryAlign)
         cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize);
 
+#ifndef NDEBUG
+      const uint64_t FinalOffset = Asm.getFragmentOffset(F);
+      const uint64_t FinalSize = Asm.computeFragmentSize(F);
+      assert(OrigOffset + OrigSize == FinalOffset + FinalSize &&
+             "can't move start of next fragment!");
+      assert(FinalSize == RemainingSize && "inconsistent size computation?");
+#endif
+
       // If we're looking at a boundary align, make sure we don't try to pad
       // its target instructions for some following directive.  Doing so would
       // break the alignment of the current boundary align.
@@ -953,7 +965,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
     }
   }
 
-  return true;
+  // The layout is done. Mark every fragment as valid.
+  for (MCSection &Section : Asm) {
+    Asm.getFragmentOffset(*Section.curFragList()->Tail);
+    Asm.computeFragmentSize(*Section.curFragList()->Tail);
+  }
 }
 
 unsigned X86AsmBackend::getMaximumNopSize(const MCSubtargetInfo &STI) const {
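Aside on the new NDEBUG asserts: they encode that padding stays local, i.e.
bytes added to a relaxable instruction are taken out of the padding that
follows, so the start of the next fragment is fixed. A toy standalone C++
model of that invariant, with names invented here:

    #include <cassert>
    #include <cstdint>

    // Growing the instruction consumes an equal amount of padding, so
    // InstrBytes + PadBytes (the end of the region) never changes. This
    // mirrors OrigOffset + OrigSize == FinalOffset + FinalSize above.
    static void growLocally(uint64_t &InstrBytes, uint64_t &PadBytes,
                            uint64_t N) {
      assert(N <= PadBytes && "can't move start of next fragment!");
      InstrBytes += N;
      PadBytes -= N;
    }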
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5a9d679d7002c..2959902c78675 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43315,9 +43315,6 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
     bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
 
   switch (Op.getOpcode()) {
-  // SSE vector multiplies either stay in bounds or saturate.
-  case X86ISD::VPMADDUBSW:
-  case X86ISD::VPMADDWD:
   // SSE vector shifts handle out of bounds shift amounts.
   case X86ISD::VSHLI:
   case X86ISD::VSRLI:
@@ -56168,17 +56165,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       SmallVector<SDValue> Subs;
       for (SDValue SubOp : SubOps)
         Subs.push_back(SubOp.getOperand(I));
-      // Attempt to peek through bitcasts and concat the original subvectors.
-      EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
-      if (SubVT.isSimple() && SubVT.isVector()) {
-        EVT ConcatVT =
-            EVT::getVectorVT(*DAG.getContext(), SubVT.getScalarType(),
-                             SubVT.getVectorElementCount() * Subs.size());
-        for (SDValue &Sub : Subs)
-          Sub = DAG.getBitcast(SubVT, Sub);
-        return DAG.getBitcast(
-            VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
-      }
       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
     };
     auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index d2d59ff3b93cf..4c77f40fd32a3 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -676,10 +676,8 @@ std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
 }
 
 static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI,
-                                      const MachineRegisterInfo &MRI,
-                                      const Register Reg) {
-  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-  return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(RC);
+                                      const TargetRegisterClass &RC) {
+  return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC);
 }
 
 bool X86PassConfig::addRegAssignAndRewriteOptimized() {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index dc3ac80bdf5cf..32a3683355b72 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6756,8 +6756,3 @@ InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
     return AM.Scale != 0;
   return -1;
 }
-
-InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
-  // TODO: Hook MispredictPenalty of SchedMachineModel into this.
-  return 14;
-}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index b619090e8e1e0..5eccb1aea308d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -294,8 +294,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
 
-  InstructionCost getBranchMispredictPenalty() const;
-
 private:
   bool supportsGather() const;
   InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index fda085f880096..82c1731f58f0a 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1562,8 +1562,6 @@ StringRef sys::getHostCPUName() {
   switch (processor_id & 0xf000) {
   case 0xc000: // Loongson 64bit, 4-issue
     return "la464";
-  case 0xd000: // Loongson 64bit, 6-issue
-    return "la664";
   // TODO: Others.
   default:
     break;
@@ -2069,8 +2067,7 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   Features["zvfhmin"] = ExtMask & (1ULL << 31); // RISCV_HWPROBE_EXT_ZVFHMIN
   Features["zfa"] = ExtMask & (1ULL << 32);     // RISCV_HWPROBE_EXT_ZFA
   Features["ztso"] = ExtMask & (1ULL << 33);    // RISCV_HWPROBE_EXT_ZTSO
-  // TODO: Re-enable zacas when it is marked non-experimental again.
-  // Features["zacas"] = ExtMask & (1ULL << 34);   // RISCV_HWPROBE_EXT_ZACAS
+  Features["zacas"] = ExtMask & (1ULL << 34);   // RISCV_HWPROBE_EXT_ZACAS
   Features["zicond"] = ExtMask & (1ULL << 35);  // RISCV_HWPROBE_EXT_ZICOND
   Features["zihintpause"] =
       ExtMask & (1ULL << 36); // RISCV_HWPROBE_EXT_ZIHINTPAUSE
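For reference: hwprobe reports extensions as bits in ExtMask, so restoring
the line above means a kernel that sets bit 34 (RISCV_HWPROBE_EXT_ZACAS,
mask 1ULL << 34 = 0x400000000) turns Features["zacas"] back on, consistent
with Zacas leaving the experimental set earlier in this patch.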
diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
index 8e86d18de2ad9..772d24c5ce3de 100644
--- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp
+++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
@@ -44,17 +44,6 @@ bool LoongArch::getArchFeatures(StringRef Arch,
       return true;
     }
   }
-
-  if (Arch == "la64v1.0" || Arch == "la64v1.1") {
-    Features.push_back("+64bit");
-    Features.push_back("+d");
-    Features.push_back("+lsx");
-    Features.push_back("+ual");
-    if (Arch == "la64v1.1")
-      Features.push_back("+frecipe");
-    return true;
-  }
-
   return false;
 }
 
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index 49a35bfcf4b9b..db1b5f689d7da 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -128,22 +128,6 @@ void getFeaturesForCPU(StringRef CPU,
     else
       EnabledFeatures.push_back(F.substr(1));
 }
-
-namespace RISCVExtensionBitmaskTable {
-#define GET_RISCVExtensionBitmaskTable_IMPL
-#include "llvm/TargetParser/RISCVTargetParserDef.inc"
-
-} // namespace RISCVExtensionBitmaskTable
-
-namespace {
-struct LessExtName {
-  bool operator()(const RISCVExtensionBitmaskTable::RISCVExtensionBitmask &LHS,
-                  StringRef RHS) {
-    return StringRef(LHS.Name) < RHS;
-  }
-};
-} // namespace
-
 } // namespace RISCV
 
 namespace RISCVVType {
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index bf89aace65e58..4fc1ff5aaa051 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -352,8 +352,6 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) {
   case OpenCL:
     return "opencl";
   case OpenHOS: return "ohos";
-  case PAuthTest:
-    return "pauthtest";
   }
 
   llvm_unreachable("Invalid EnvironmentType!");
@@ -730,7 +728,6 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
       .StartsWith("amplification", Triple::Amplification)
       .StartsWith("opencl", Triple::OpenCL)
       .StartsWith("ohos", Triple::OpenHOS)
-      .StartsWith("pauthtest", Triple::PAuthTest)
       .Default(Triple::UnknownEnvironment);
 }
 
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index d8e827e9cebcf..489106422e199 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -148,7 +148,6 @@ void Lowerer::lowerCoroNoop(IntrinsicInst *II) {
     NoopCoro = new GlobalVariable(M, NoopCoroConst->getType(), /*isConstant=*/true,
                                 GlobalVariable::PrivateLinkage, NoopCoroConst,
                                 "NoopCoro.Frame.Const");
-    cast<GlobalVariable>(NoopCoro)->setNoSanitizeMetadata();
   }
 
   Builder.SetInsertPoint(II);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index aaf4ece3249a2..e387034110df9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1247,11 +1247,8 @@ bool InstCombinerImpl::replaceInInstruction(Value *V, Value *Old, Value *New,
   if (Depth == 2)
     return false;
 
-  assert(!isa<Constant>(Old) && "Only replace non-constant values");
-
   auto *I = dyn_cast<Instruction>(V);
-  if (!I || !I->hasOneUse() ||
-      !isSafeToSpeculativelyExecuteWithVariableReplaced(I))
+  if (!I || !I->hasOneUse() || !isSafeToSpeculativelyExecute(I))
     return false;
 
   bool Changed = false;
@@ -1334,7 +1331,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
     // profitability is not clear for other cases.
     // FIXME: Support vectors.
     if (OldOp == CmpLHS && match(NewOp, m_ImmConstant()) &&
-        !match(OldOp, m_Constant()) && !Cmp.getType()->isVectorTy() &&
+        !match(OldOp, m_ImmConstant()) && !Cmp.getType()->isVectorTy() &&
         isGuaranteedNotToBeUndef(NewOp, SQ.AC, &Sel, &DT))
       if (replaceInInstruction(TrueVal, OldOp, NewOp))
         return &Sel;
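One behavioral note on the guard change above: m_ImmConstant() matches only
constants that are not ConstantExprs, so with this revert OldOp is rejected
only when it is a plain immediate such as i32 7; the m_Constant() check it
replaces also rejected constant expressions.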
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 9fb1df7ab2b79..149866a8e4200 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -2947,8 +2947,6 @@ bool AddressSanitizer::instrumentFunction(Function &F,
   if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
   if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
   if (F.getName().starts_with("__asan_")) return false;
-  if (F.isPresplitCoroutine())
-    return false;
 
   bool FunctionModified = false;
 
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index d1396071c0bad..09c56eb5fe6aa 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -170,30 +170,6 @@ cl::opt<bool> SkipRetExitBlock(
     "skip-ret-exit-block", cl::init(true),
     cl::desc("Suppress counter promotion if exit blocks contain ret."));
 
-static cl::opt<bool> SampledInstr("sampled-instrumentation", cl::ZeroOrMore,
-                                  cl::init(false),
-                                  cl::desc("Do PGO instrumentation sampling"));
-
-static cl::opt<unsigned> SampledInstrPeriod(
-    "sampled-instr-period",
-    cl::desc("Set the profile instrumentation sample period. For each sample "
-             "period, a fixed number of consecutive samples will be recorded. "
-             "The number is controlled by 'sampled-instr-burst-duration' flag. "
-             "The default sample period of 65535 is optimized for generating "
-             "efficient code that leverages unsigned integer wrapping in "
-             "overflow."),
-    cl::init(65535));
-
-static cl::opt<unsigned> SampledInstrBurstDuration(
-    "sampled-instr-burst-duration",
-    cl::desc("Set the profile instrumentation burst duration, which can range "
-             "from 0 to one less than the value of 'sampled-instr-period'. "
-             "This number of samples will be recorded for each "
-             "'sampled-instr-period' count update. Setting to 1 enables "
-             "simple sampling, in which case it is recommended to set "
-             "'sampled-instr-period' to a prime number."),
-    cl::init(200));
-
 using LoadStorePair = std::pair<Instruction *, Instruction *>;
 
 static uint64_t getIntModuleFlagOrZero(const Module &M, StringRef Flag) {
@@ -284,9 +260,6 @@ class InstrLowerer final {
   /// Returns true if profile counter update register promotion is enabled.
   bool isCounterPromotionEnabled() const;
 
-  /// Return true if profile sampling is enabled.
-  bool isSamplingEnabled() const;
-
   /// Count the number of instrumented value sites for the function.
   void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins);
 
@@ -318,9 +291,6 @@ class InstrLowerer final {
   /// acts on.
   Value *getCounterAddress(InstrProfCntrInstBase *I);
 
-  /// Lower the incremental instructions under profile sampling predicates.
-  void doSampling(Instruction *I);
-
   /// Get the region counters for an increment, creating them if necessary.
   ///
   /// If the counter array doesn't yet exist, the profile data variables
@@ -665,169 +635,33 @@ PreservedAnalyses InstrProfilingLoweringPass::run(Module &M,
   return PreservedAnalyses::none();
 }
 
-//
-// Perform instrumentation sampling.
-//
-// There are 3 flavors of sampling:
-// (1) Full burst sampling: We transform:
-//   Increment_Instruction;
-// to:
-//   if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
-//     Increment_Instruction;
-//   }
-//   __llvm_profile_sampling__ += 1;
-//   if (__llvm_profile_sampling__ >= SampledInstrPeriod) {
-//     __llvm_profile_sampling__ = 0;
-//   }
-//
-// "__llvm_profile_sampling__" is a thread-local global shared by all PGO
-// counters (value-instrumentation and edge instrumentation).
-//
-// (2) Fast burst sampling:
-// The "__llvm_profile_sampling__" variable is an unsigned type, so it wraps
-// around to zero on overflow. In this case, the second check is
-// unnecessary, and we won't generate it when SampledInstrPeriod is
-// set to 65535 (64K - 1). The code becomes:
-//   if (__llvm_profile_sampling__ < SampledInstrBurstDuration) {
-//     Increment_Instruction;
-//   }
-//   __llvm_profile_sampling__ += 1;
-//
-// (3) Simple sampling:
-// When SampledInstrBurstDuration is set to 1, we do simple sampling:
-//   __llvm_profile_sampling__ += 1;
-//   if (__llvm_profile_sampling__ >= SampledInstrPeriod) {
-//     __llvm_profile_sampling__ = 0;
-//     Increment_Instruction;
-//   }
-//
-// Note that the code snippet after the transformation can still be counter
-// promoted. However, with sampling enabled, counter updates are expected to
-// be infrequent, making the benefits of counter promotion negligible.
-// Moreover, counter promotion can potentially cause issues in server
-// applications, particularly when the counters are dumped without a clean
-// exit. To mitigate this risk, counter promotion is disabled by default when
-// sampling is enabled. This behavior can be overridden using the internal
-// option.
-void InstrLowerer::doSampling(Instruction *I) {
-  if (!isSamplingEnabled())
-    return;
-
-  unsigned SampledBurstDuration = SampledInstrBurstDuration.getValue();
-  unsigned SampledPeriod = SampledInstrPeriod.getValue();
-  if (SampledBurstDuration >= SampledPeriod) {
-    report_fatal_error(
-        "SampledPeriod needs to be greater than SampledBurstDuration");
-  }
-  bool UseShort = (SampledPeriod <= USHRT_MAX);
-  bool IsSimpleSampling = (SampledBurstDuration == 1);
-  // If (SampledBurstDuration == 1 && SampledPeriod == 65535), generate
-  // the simple sampling style code.
-  bool IsFastSampling = (!IsSimpleSampling && SampledPeriod == 65535);
-
-  auto GetConstant = [UseShort](IRBuilder<> &Builder, uint32_t C) {
-    if (UseShort)
-      return Builder.getInt16(C);
-    else
-      return Builder.getInt32(C);
-  };
-
-  IntegerType *SamplingVarTy;
-  if (UseShort)
-    SamplingVarTy = Type::getInt16Ty(M.getContext());
-  else
-    SamplingVarTy = Type::getInt32Ty(M.getContext());
-  auto *SamplingVar =
-      M.getGlobalVariable(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
-  assert(SamplingVar && "SamplingVar not set properly");
-
-  // Create the condition for checking the burst duration.
-  Instruction *SamplingVarIncr;
-  Value *NewSamplingVarVal;
-  MDBuilder MDB(I->getContext());
-  MDNode *BranchWeight;
-  IRBuilder<> CondBuilder(I);
-  auto *LoadSamplingVar = CondBuilder.CreateLoad(SamplingVarTy, SamplingVar);
-  if (IsSimpleSampling) {
-    // For the simple sampling, just create the load and increments.
-    IRBuilder<> IncBuilder(I);
-    NewSamplingVarVal =
-        IncBuilder.CreateAdd(LoadSamplingVar, GetConstant(IncBuilder, 1));
-    SamplingVarIncr = IncBuilder.CreateStore(NewSamplingVarVal, SamplingVar);
-  } else {
-    // For burst sampling, create the conditional update.
-    auto *DurationCond = CondBuilder.CreateICmpULE(
-        LoadSamplingVar, GetConstant(CondBuilder, SampledBurstDuration));
-    BranchWeight = MDB.createBranchWeights(
-        SampledBurstDuration, SampledPeriod + 1 - SampledBurstDuration);
-    Instruction *ThenTerm = SplitBlockAndInsertIfThen(
-        DurationCond, I, /* Unreachable */ false, BranchWeight);
-    IRBuilder<> IncBuilder(I);
-    NewSamplingVarVal =
-        IncBuilder.CreateAdd(LoadSamplingVar, GetConstant(IncBuilder, 1));
-    SamplingVarIncr = IncBuilder.CreateStore(NewSamplingVarVal, SamplingVar);
-    I->moveBefore(ThenTerm);
-  }
-
-  if (IsFastSampling)
-    return;
-
-  // Create the condition for checking the period.
-  Instruction *ThenTerm, *ElseTerm;
-  IRBuilder<> PeriodCondBuilder(SamplingVarIncr);
-  auto *PeriodCond = PeriodCondBuilder.CreateICmpUGE(
-      NewSamplingVarVal, GetConstant(PeriodCondBuilder, SampledPeriod));
-  BranchWeight = MDB.createBranchWeights(1, SampledPeriod);
-  SplitBlockAndInsertIfThenElse(PeriodCond, SamplingVarIncr, &ThenTerm,
-                                &ElseTerm, BranchWeight);
-
-  // For simple sampling, the counter update happens in the sampling-variable reset.
-  if (IsSimpleSampling)
-    I->moveBefore(ThenTerm);
-
-  IRBuilder<> ResetBuilder(ThenTerm);
-  ResetBuilder.CreateStore(GetConstant(ResetBuilder, 0), SamplingVar);
-  SamplingVarIncr->moveBefore(ElseTerm);
-}
-
 bool InstrLowerer::lowerIntrinsics(Function *F) {
   bool MadeChange = false;
   PromotionCandidates.clear();
-  SmallVector<InstrProfInstBase *, 8> InstrProfInsts;
-
-  // To ensure compatibility with sampling, we save the intrinsics into
-  // a buffer to prevent potential breakage of the iterator (as the
-  // intrinsics will be moved to a different BB).
   for (BasicBlock &BB : *F) {
     for (Instruction &Instr : llvm::make_early_inc_range(BB)) {
-      if (auto *IP = dyn_cast<InstrProfInstBase>(&Instr))
-        InstrProfInsts.push_back(IP);
-    }
-  }
-
-  for (auto *Instr : InstrProfInsts) {
-    doSampling(Instr);
-    if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(Instr)) {
-      lowerIncrement(IPIS);
-      MadeChange = true;
-    } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(Instr)) {
-      lowerIncrement(IPI);
-      MadeChange = true;
-    } else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(Instr)) {
-      lowerTimestamp(IPC);
-      MadeChange = true;
-    } else if (auto *IPC = dyn_cast<InstrProfCoverInst>(Instr)) {
-      lowerCover(IPC);
-      MadeChange = true;
-    } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(Instr)) {
-      lowerValueProfileInst(IPVP);
-      MadeChange = true;
-    } else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(Instr)) {
-      IPMP->eraseFromParent();
-      MadeChange = true;
-    } else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(Instr)) {
-      lowerMCDCTestVectorBitmapUpdate(IPBU);
-      MadeChange = true;
+      if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(&Instr)) {
+        lowerIncrement(IPIS);
+        MadeChange = true;
+      } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(&Instr)) {
+        lowerIncrement(IPI);
+        MadeChange = true;
+      } else if (auto *IPC = dyn_cast<InstrProfTimestampInst>(&Instr)) {
+        lowerTimestamp(IPC);
+        MadeChange = true;
+      } else if (auto *IPC = dyn_cast<InstrProfCoverInst>(&Instr)) {
+        lowerCover(IPC);
+        MadeChange = true;
+      } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
+        lowerValueProfileInst(IPVP);
+        MadeChange = true;
+      } else if (auto *IPMP = dyn_cast<InstrProfMCDCBitmapParameters>(&Instr)) {
+        IPMP->eraseFromParent();
+        MadeChange = true;
+      } else if (auto *IPBU = dyn_cast<InstrProfMCDCTVBitmapUpdate>(&Instr)) {
+        lowerMCDCTestVectorBitmapUpdate(IPBU);
+        MadeChange = true;
+      }
     }
   }
 
@@ -850,12 +684,6 @@ bool InstrLowerer::isRuntimeCounterRelocationEnabled() const {
   return TT.isOSFuchsia();
 }
 
-bool InstrLowerer::isSamplingEnabled() const {
-  if (SampledInstr.getNumOccurrences() > 0)
-    return SampledInstr;
-  return Options.Sampling;
-}
-
 bool InstrLowerer::isCounterPromotionEnabled() const {
   if (DoCounterPromotion.getNumOccurrences() > 0)
     return DoCounterPromotion;
@@ -926,9 +754,6 @@ bool InstrLowerer::lower() {
   if (NeedsRuntimeHook)
     MadeChange = emitRuntimeHook();
 
-  if (!IsCS && isSamplingEnabled())
-    createProfileSamplingVar(M);
-
   bool ContainsProfiling = containsProfilingIntrinsics(M);
   GlobalVariable *CoverageNamesVar =
       M.getNamedGlobal(getCoverageUnusedNamesVarName());
@@ -2130,29 +1955,3 @@ void InstrLowerer::emitInitialization() {
 
   appendToGlobalCtors(M, F, 0);
 }
-
-namespace llvm {
-// Create the variable for profile sampling.
-void createProfileSamplingVar(Module &M) {
-  const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_SAMPLING_VAR));
-  IntegerType *SamplingVarTy;
-  Constant *ValueZero;
-  if (SampledInstrPeriod.getValue() <= USHRT_MAX) {
-    SamplingVarTy = Type::getInt16Ty(M.getContext());
-    ValueZero = Constant::getIntegerValue(SamplingVarTy, APInt(16, 0));
-  } else {
-    SamplingVarTy = Type::getInt32Ty(M.getContext());
-    ValueZero = Constant::getIntegerValue(SamplingVarTy, APInt(32, 0));
-  }
-  auto SamplingVar = new GlobalVariable(
-      M, SamplingVarTy, false, GlobalValue::WeakAnyLinkage, ValueZero, VarName);
-  SamplingVar->setVisibility(GlobalValue::DefaultVisibility);
-  SamplingVar->setThreadLocal(true);
-  Triple TT(M.getTargetTriple());
-  if (TT.supportsCOMDAT()) {
-    SamplingVar->setLinkage(GlobalValue::ExternalLinkage);
-    SamplingVar->setComdat(M.getOrInsertComdat(VarName));
-  }
-  appendToCompilerUsed(M, SamplingVar);
-}
-} // namespace llvm
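A worked example of the deleted branch-weight math, using the defaults from
the removed flags (burst duration 200, period 65535): the duration check
received weights 200 : (65535 + 1 - 200) = 200 : 65336, and the period-reset
check received 1 : 65535, so the counting paths were modeled as taken well
under 1% of the time.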
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 1ce8f58c1aa14..35b1bbf21be97 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1096,7 +1096,7 @@ class PGOUseFunc {
       : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
         FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS,
                  InstrumentFuncEntry, HasSingleByteCoverage),
-        FreqAttr(FFA_Normal), IsCS(IsCS), VPC(Func, TLI) {}
+        FreqAttr(FFA_Normal), IsCS(IsCS) {}
 
   void handleInstrProfError(Error Err, uint64_t MismatchedFuncSum);
 
@@ -1178,8 +1178,6 @@ class PGOUseFunc {
   // Is to use the context sensitive profile.
   bool IsCS;
 
-  ValueProfileCollector VPC;
-
   // Find the Instrumented BB and set the value. Return false on error.
   bool setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
 
@@ -1757,23 +1755,8 @@ void PGOUseFunc::annotateValueSites() {
 void PGOUseFunc::annotateValueSites(uint32_t Kind) {
   assert(Kind <= IPVK_Last);
   unsigned ValueSiteIndex = 0;
-
-  unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
-
-  // Since there isn't a reliable or fast way for the profile reader to tell if
-  // a profile is generated with `-enable-vtable-value-profiling` on, we run the
-  // value profile collector over the function IR to find the instrumented sites
-  // iff the function profile record shows a nonzero number of vtable sites.
-  // The function CFG hash already takes the number of instrumented
-  // indirect call sites into account so it doesn't hash the number of
-  // instrumented vtables; as a side effect it makes it easier to enable
-  // profiling and profile use in two steps if needed.
-  // TODO: Remove this if/when -enable-vtable-value-profiling is on by default.
-  if (NumValueSites > 0 && Kind == IPVK_VTableTarget &&
-      NumValueSites != FuncInfo.ValueSites[IPVK_VTableTarget].size() &&
-      MaxNumVTableAnnotations != 0)
-    FuncInfo.ValueSites[IPVK_VTableTarget] = VPC.get(IPVK_VTableTarget);
   auto &ValueSites = FuncInfo.ValueSites[Kind];
+  unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
   if (NumValueSites != ValueSites.size()) {
     auto &Ctx = M->getContext();
     Ctx.diagnose(DiagnosticInfoPGOProfile(
@@ -1892,8 +1875,6 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &MAM) {
   // The variable in a comdat may be discarded by LTO. Ensure the declaration
   // will be retained.
   appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true));
-  if (ProfileSampling)
-    createProfileSamplingVar(M);
   PreservedAnalyses PA;
   PA.preserve<FunctionAnalysisManagerModuleProxy>();
   PA.preserveSet<AllAnalysesOn<Function>>();
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 11de37f7a7c10..b7baf34f27c21 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -77,9 +77,6 @@ static cl::opt<bool> UserSinkCommonInsts(
     "sink-common-insts", cl::Hidden, cl::init(false),
     cl::desc("Sink common instructions (default = false)"));
 
-static cl::opt<bool> UserSpeculateUnpredictables(
-    "speculate-unpredictables", cl::Hidden, cl::init(false),
-    cl::desc("Speculate unpredictable branches (default = false)"));
 
 STATISTIC(NumSimpl, "Number of blocks simplified");
 
@@ -328,8 +325,6 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
     Options.HoistCommonInsts = UserHoistCommonInsts;
   if (UserSinkCommonInsts.getNumOccurrences())
     Options.SinkCommonInsts = UserSinkCommonInsts;
-  if (UserSpeculateUnpredictables.getNumOccurrences())
-    Options.SpeculateUnpredictables = UserSpeculateUnpredictables;
 }
 
 SimplifyCFGPass::SimplifyCFGPass() {
@@ -356,9 +351,7 @@ void SimplifyCFGPass::printPipeline(
   OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;";
   OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts;";
   OS << (Options.SpeculateBlocks ? "" : "no-") << "speculate-blocks;";
-  OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch;";
-  OS << (Options.SpeculateUnpredictables ? "" : "no-")
-     << "speculate-unpredictables";
+  OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch";
   OS << '>';
 }
 
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 75585fcc80266..9be60721bebb5 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -425,12 +425,14 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
 
 // Returns true if A matches B + C where C is constant.
 static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
-  return match(A, m_c_Add(m_Value(B), m_ConstantInt(C)));
+  return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
+          match(A, m_Add(m_ConstantInt(C), m_Value(B))));
 }
 
 // Returns true if A matches B | C where C is constant.
 static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
-  return match(A, m_c_Or(m_Value(B), m_ConstantInt(C)));
+  return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
+          match(A, m_Or(m_ConstantInt(C), m_Value(B))));
 }
 
 void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
diff --git a/llvm/lib/Transforms/Utils/DXILResource.cpp b/llvm/lib/Transforms/Utils/DXILResource.cpp
index de2b6512a6d1c..bf45654a591b5 100644
--- a/llvm/lib/Transforms/Utils/DXILResource.cpp
+++ b/llvm/lib/Transforms/Utils/DXILResource.cpp
@@ -329,8 +329,8 @@ std::pair<uint32_t, uint32_t> ResourceInfo::getAnnotateProps() const {
   uint32_t ResourceKind = llvm::to_underlying(Kind);
   uint32_t AlignLog2 = isStruct() ? Log2(Struct.Alignment) : 0;
   bool IsUAV = isUAV();
-  bool IsROV = IsUAV && UAVFlags.IsROV;
-  bool IsGloballyCoherent = IsUAV && UAVFlags.GloballyCoherent;
+  bool IsROV = IsUAV ? UAVFlags.IsROV : 0;
+  bool IsGloballyCoherent = IsUAV ? UAVFlags.GloballyCoherent : 0;
   uint8_t SamplerCmpOrHasCounter = 0;
   if (IsUAV)
     SamplerCmpOrHasCounter = UAVFlags.HasCounter;
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index a0406111ecbf3..0afa1ef780504 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1091,8 +1091,8 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
   assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
   assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
 
-  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
-    MDNode *MD = dyn_cast<MDNode>(MDO);
+  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
     if (!MD)
       continue;
 
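Both iteration styles skip operand 0 because a loop-ID node is self-referential: its first operand is the node itself, as the asserts above check, and operands 1..N-1 hold the actual hint nodes. A minimal sketch of the invariant:

    // A loop ID looks like: !0 = distinct !{!0, !1, !2}
    assert(LoopID->getOperand(0) == LoopID && "operand 0 is the self-reference");
    for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands()))
      if (auto *Hint = dyn_cast<MDNode>(MDO))
        (void)Hint; // operands 1..N-1: the actual hint nodes
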
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f23e28888931d..8f717cb43bcb4 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3476,8 +3476,7 @@ static bool FoldCondBranchOnValueKnownInPredecessor(BranchInst *BI,
 /// Given a BB that starts with the specified two-entry PHI node,
 /// see if we can eliminate it.
 static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
-                                DomTreeUpdater *DTU, const DataLayout &DL,
-                                bool SpeculateUnpredictables) {
+                                DomTreeUpdater *DTU, const DataLayout &DL) {
   // Ok, this is a two-entry PHI node.  Check to see if this is a simple "if
   // statement", which has a very simple dominance structure.  Basically, we
   // are trying to find the condition that is being branched on, which
@@ -3509,8 +3508,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // jump to one specific 'then' block (if we have two of them).
   // It isn't beneficial to speculatively execute the code
   // from the block that we know is predictably not entered.
-  bool IsUnpredictable = DomBI->getMetadata(LLVMContext::MD_unpredictable);
-  if (!IsUnpredictable) {
+  if (!DomBI->getMetadata(LLVMContext::MD_unpredictable)) {
     uint64_t TWeight, FWeight;
     if (extractBranchWeights(*DomBI, TWeight, FWeight) &&
         (TWeight + FWeight) != 0) {
@@ -3553,8 +3551,6 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   InstructionCost Cost = 0;
   InstructionCost Budget =
       TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
-  if (SpeculateUnpredictables && IsUnpredictable)
-    Budget += TTI.getBranchMispredictPenalty();
 
   bool Changed = false;
   for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
@@ -3624,9 +3620,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
              [](BasicBlock *IfBlock) { return IfBlock->hasAddressTaken(); }))
     return Changed;
 
-  LLVM_DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond;
-             if (IsUnpredictable) dbgs() << " (unpredictable)";
-             dbgs() << "  T: " << IfTrue->getName()
+  LLVM_DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond
+                    << "  T: " << IfTrue->getName()
                     << "  F: " << IfFalse->getName() << "\n");
 
   // If we can still promote the PHI nodes after this gauntlet of tests,
@@ -7819,8 +7814,7 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
     // eliminate it, do so now.
     if (auto *PN = dyn_cast<PHINode>(BB->begin()))
       if (PN->getNumIncomingValues() == 2)
-        if (FoldTwoEntryPHINode(PN, TTI, DTU, DL,
-                                Options.SpeculateUnpredictables))
+        if (FoldTwoEntryPHINode(PN, TTI, DTU, DL))
           return true;
   }
 
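For context, the lines removed in this hunk implemented the speculate-unpredictables heuristic: when the dominating branch carries !unpredictable metadata, the speculation budget for folding the two-entry PHI is raised by the target's mispredict penalty, so if-conversion is attempted more aggressively exactly where the branch predictor is expected to do poorly. Restated as a sketch using the names from the hunk:

    InstructionCost Budget =
        TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
    bool IsUnpredictable = DomBI->getMetadata(LLVMContext::MD_unpredictable);
    if (SpeculateUnpredictables && IsUnpredictable)
      // A mispredicted branch is costly, so tolerate speculating more
      // instructions before abandoning the fold.
      Budget += TTI.getBranchMispredictPenalty();
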
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index cafec165f6d6f..f54eebb2874ab 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -261,20 +261,20 @@ void LoopVectorizeHints::getHintsFromMetadata() {
   assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
   assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
 
-  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
+  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
     const MDString *S = nullptr;
     SmallVector<Metadata *, 4> Args;
 
     // The expected hint is either an MDString or an MDNode with the first
     // operand an MDString.
-    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
+    if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
       if (!MD || MD->getNumOperands() == 0)
         continue;
       S = dyn_cast<MDString>(MD->getOperand(0));
       for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
         Args.push_back(MD->getOperand(i));
     } else {
-      S = dyn_cast<MDString>(MDO);
+      S = dyn_cast<MDString>(LoopID->getOperand(i));
       assert(Args.size() == 0 && "too many arguments for MDString");
     }
 
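As the comment above says, a hint is either a bare MDString or an MDNode whose operand 0 names the hint and whose remaining operands are its arguments. A hedged sketch of constructing the second shape, assuming an LLVMContext &Ctx and the usual llvm/IR/Metadata.h and llvm/IR/Constants.h includes:

    // Builds !{!"llvm.loop.vectorize.width", i32 4}, the shape parsed above.
    Metadata *Ops[] = {
        MDString::get(Ctx, "llvm.loop.vectorize.width"),
        ConstantAsMetadata::get(
            ConstantInt::get(Type::getInt32Ty(Ctx), 4))};
    MDNode *WidthHint = MDNode::get(Ctx, Ops);
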
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8fa1a57177d93..6d28b8fabe42e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8371,8 +8371,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 
 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
 // original exit block.
-static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder,
-                                VPlan &Plan) {
+static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
+                                VPRecipeBuilder &Builder, VPlan &Plan) {
   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
   // Only handle single-exit loops with unique exit blocks for now.
@@ -8605,7 +8605,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     // and there is nothing to fix from vector loop; phis should have incoming
     // from scalar loop only.
   } else
-    addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan);
+    addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
 
   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
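
addUsersInExitBlock only applies to the single-exit case; per the comment shown above, the function bails out otherwise, and each LCSSA phi in the unique exit block then receives a VPLiveOut recording its incoming value from the vector loop. A hedged sketch of how such a guard typically looks, reusing the names from this hunk:

    BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
    BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
    // Only single-exit loops with unique exit blocks are handled for now.
    if (!ExitBB || !ExitingBB)
      return;
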
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cca9eeebaa53f..667c4eb311c22 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9699,8 +9699,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
           CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
               CanonicalType->getContext(),
               DL->getTypeSizeInBits(CanonicalType->getScalarType())));
-        IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
-                                          {CanonicalType, CanonicalType});
+        IntrinsicCostAttributes CostAttrs(MinMaxID, VecTy, {VecTy, VecTy});
         InstructionCost IntrinsicCost =
             TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
         // If the selects are the only uses of the compares, they will be
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0b596e7e4f633..805d9d91fc186 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1399,10 +1399,9 @@ class VPInstruction : public VPRecipeWithIRFlags {
   bool isSingleScalar() const;
 };
 
-/// VPWidenRecipe is a recipe for producing a widened instruction using the
-/// opcode and operands of the recipe. This recipe covers most of the
-/// traditional vectorization cases where each recipe transforms into a
-/// vectorized version of itself.
+/// VPWidenRecipe is a recipe for producing a vector-type copy of its
+/// ingredient. This recipe covers most of the traditional vectorization cases
+/// where each ingredient transforms into a vectorized version of itself.
 class VPWidenRecipe : public VPRecipeWithIRFlags {
   unsigned Opcode;
 
@@ -1422,8 +1421,7 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
 
   VP_CLASSOF_IMPL(VPDef::VPWidenSC)
 
-  /// Produce a widened instruction using the opcode and operands of the recipe,
-  /// processing State.VF elements.
+  /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
   unsigned getOpcode() const { return Opcode; }
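
VP_CLASSOF_IMPL supplies the LLVM-style RTTI hooks for the recipe class, so clients discriminate recipes with isa/dyn_cast rather than virtual queries. A hedged sketch of typical use, assuming a VPRecipeBase &R:

    if (auto *WR = dyn_cast<VPWidenRecipe>(&R)) {
      // The widened opcode is the scalar opcode being vectorized,
      // e.g. Instruction::Add.
      unsigned Opc = WR->getOpcode();
      (void)Opc;
    }
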
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index 5236f5a3bae95..d1e8bb015491e 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -8,36 +8,36 @@ define i32 @fadd() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fadd <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fadd <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fadd <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fadd <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fadd <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fadd <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fadd <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fadd <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fadd <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fadd <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fadd <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fadd <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fadd <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fadd <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fadd <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fadd <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fadd <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fadd <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fadd <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fadd <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fadd <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fadd <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fadd <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fadd <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fadd <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fadd <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fadd <vscale x 1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fadd <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fadd <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fadd <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fadd <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fadd <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fadd <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fadd <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fadd <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fadd <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fadd <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fadd <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fadd <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fadd <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fadd <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fadd <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fadd <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fadd <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fadd <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fadd <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fadd <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fadd <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fadd <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fadd <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fadd <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fadd <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fadd <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fadd <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fadd <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fadd <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fadd <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fadd <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fadd <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fadd <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fadd half undef, undef
@@ -88,36 +88,36 @@ define i32 @fsub() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fsub <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fsub <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fsub <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fsub <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fsub <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fsub <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fsub <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fsub <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fsub <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fsub <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fsub <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fsub <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fsub <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fsub <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fsub <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fsub <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fsub <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fsub <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fsub <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fsub <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fsub <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fsub <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fsub <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fsub <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fsub <vscale x 1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fsub <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fsub <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fsub <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fsub <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fsub <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fsub <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fsub <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fsub <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fsub <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fsub <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fsub <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fsub <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fsub <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fsub <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fsub <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fsub <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fsub <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fsub <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fsub <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fsub <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fsub <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fsub <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fsub <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fsub <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fsub <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fsub <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fsub <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fsub <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fsub half undef, undef
@@ -168,36 +168,36 @@ define i32 @fmul() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fmul half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fmul float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fmul <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fmul <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fmul <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fmul <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fmul <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fmul <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fmul <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fmul <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fmul <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fmul <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fmul <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fmul <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fmul <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fmul <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fmul <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fmul <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fmul <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fmul <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fmul <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fmul <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fmul <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fmul <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fmul <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fmul <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fmul <vscale x 1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fmul <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fmul <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fmul <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fmul <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fmul <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fmul <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fmul <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fmul <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fmul <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fmul <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fmul <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fmul <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fmul <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fmul <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fmul <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fmul <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fmul <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fmul <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fmul <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fmul <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fmul <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fmul <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fmul <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fmul <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fmul <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fmul <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fmul <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fmul <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fmul <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fmul <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fmul <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fmul <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fmul <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fmul half undef, undef
@@ -248,36 +248,36 @@ define i32 @fdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fdiv <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fdiv <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fdiv <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fdiv <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fdiv <32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fdiv <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fdiv <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fdiv <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fdiv <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fdiv <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fdiv <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fdiv <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fdiv <1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fdiv <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fdiv <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fdiv <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fdiv <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fdiv <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fdiv <1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fdiv <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fdiv <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fdiv <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fdiv <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fdiv <1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fdiv <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fdiv <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fdiv <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fdiv half undef, undef
@@ -408,36 +408,36 @@ define i32 @fneg() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F16 = fneg half undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = fneg float undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fneg double undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fneg <1 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fneg <2 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fneg <4 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fneg <8 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fneg <16 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fneg <32 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fneg <vscale x 1 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fneg <vscale x 2 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fneg <vscale x 4 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fneg <vscale x 8 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fneg <vscale x 16 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fneg <vscale x 32 x half> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fneg <1 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fneg <2 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fneg <4 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fneg <8 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fneg <16 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fneg <vscale x 1 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fneg <vscale x 2 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fneg <vscale x 4 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fneg <vscale x 8 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fneg <vscale x 16 x float> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fneg <1 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fneg <2 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fneg <4 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fneg <8 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fneg <vscale x 1 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fneg <vscale x 2 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fneg <vscale x 4 x double> undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fneg <vscale x 8 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fneg <1 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fneg <2 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fneg <4 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fneg <8 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fneg <16 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fneg <32 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fneg <vscale x 1 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fneg <vscale x 2 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fneg <vscale x 4 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fneg <vscale x 8 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fneg <vscale x 16 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fneg <vscale x 32 x half> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fneg <1 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fneg <2 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fneg <4 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fneg <8 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fneg <16 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fneg <vscale x 1 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fneg <vscale x 2 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fneg <vscale x 4 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fneg <vscale x 8 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fneg <vscale x 16 x float> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fneg <1 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fneg <2 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fneg <4 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fneg <8 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fneg <vscale x 1 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fneg <vscale x 2 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fneg <vscale x 4 x double> undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fneg <vscale x 8 x double> undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F16 = fneg half undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 67c081ba5d3c6..87ffb23dcb88e 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -21,7 +21,7 @@ define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
 
 define void @powi(<vscale x 4 x float> %vec) {
 ; CHECK-LABEL: 'powi'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> %vec, i32 42)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'powi'
@@ -1383,73 +1383,73 @@ define void @reduce_fadd() {
 
 define void @vp_fadd(){
 ; CHECK-LABEL: 'vp_fadd'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t1 = fadd <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t3 = fadd <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t5 = fadd <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t7 = fadd <16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = fadd <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t11 = fadd <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t13 = fadd <8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = fadd <16 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = fadd <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = fadd <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t5 = fadd <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t7 = fadd <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = fadd <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t11 = fadd <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = fadd <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = fadd <16 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'vp_fadd'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t1 = fadd <2 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t3 = fadd <4 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t5 = fadd <8 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t7 = fadd <16 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = fadd <2 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t11 = fadd <4 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t13 = fadd <8 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = fadd <16 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = fadd <2 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = fadd <4 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t5 = fadd <8 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t7 = fadd <16 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = fadd <2 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t11 = fadd <4 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = fadd <8 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = fadd <16 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = fadd <vscale x 2 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t19 = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t20 = fadd <vscale x 4 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t21 = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t22 = fadd <vscale x 8 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t23 = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t24 = fadd <vscale x 16 x float> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = fadd <vscale x 2 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = fadd <vscale x 4 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = fadd <vscale x 8 x double> undef, undef
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x double> @llvm.vp.fadd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = fadd <vscale x 16 x double> undef, undef
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
deleted file mode 100644
index e6f7a2953a0c3..0000000000000
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
+++ /dev/null
@@ -1,141 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s
----
-name:            test_scmp
-body:             |
-  bb.0.entry:
-    ; CHECK-LABEL: name: test_scmp
-    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
-    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[C2]], [[SELECT]]
-    ; CHECK-NEXT: $w0 = COPY [[SELECT1]](s32)
-    %0:_(s64) = COPY $x0
-    %1:_(s64) = COPY $x0
-    %4:_(s2) = G_SCMP %0(s64), %1
-    %14:_(s32) = G_ANYEXT %4(s2)
-    $w0 = COPY %14(s32)
-
-...
----
-name:            test_ucmp
-body:             |
-  bb.0.entry:
-    ; CHECK-LABEL: name: test_ucmp
-    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
-    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[C2]], [[SELECT]]
-    ; CHECK-NEXT: $w0 = COPY [[SELECT1]](s32)
-    %0:_(s64) = COPY $x0
-    %1:_(s64) = COPY $x0
-    %4:_(s2) = G_UCMP %0(s64), %1
-    %14:_(s32) = G_ANYEXT %4(s2)
-    $w0 = COPY %14(s32)
-
-...
----
-name:            test_ucmp_vector
-body:             |
-  bb.0.entry:
-    ; CHECK-LABEL: name: test_ucmp_vector
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $w1
-    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $w2
-    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $w3
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ugt), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 3
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR2]](<8 x s8>)
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s16>), [[UV1:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
-    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[TRUNC]], [[UV]]
-    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
-    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR3]](<8 x s8>)
-    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<4 x s16>), [[UV3:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>)
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[UV2]], [[TRUNC1]]
-    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
-    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR4]](<8 x s8>)
-    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[UV4]], [[XOR]]
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND]], [[AND1]]
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 3
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ult), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
-    ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C3]](s8), [[C3]](s8), [[C3]](s8), [[C3]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
-    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR5]](<8 x s8>)
-    ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s16>), [[UV7:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT3]](<8 x s16>)
-    ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(<4 x s16>) = G_XOR [[TRUNC2]], [[UV6]]
-    ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C3]](s8), [[C3]](s8), [[C3]](s8), [[C3]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
-    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR6]](<8 x s8>)
-    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<4 x s16>), [[UV9:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT4]](<8 x s16>)
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<4 x s16>) = G_AND [[UV8]], [[TRUNC3]]
-    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<4 x s16>) = G_AND [[OR]], [[XOR1]]
-    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND2]], [[AND3]]
-    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[OR1]](<4 x s16>)
-    ; CHECK-NEXT: $q0 = COPY [[ANYEXT5]](<4 x s32>)
-    %0:_(s32) = COPY $w0
-    %1:_(s32) = COPY $w1
-    %2:_(s32) = COPY $w2
-    %3:_(s32) = COPY $w3
-    %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32)
-    %5:_(s32) = COPY $w0
-    %6:_(s32) = COPY $w1
-    %7:_(s32) = COPY $w2
-    %8:_(s32) = COPY $w3
-    %9:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32)
-    %10:_(<4 x s2>) = G_UCMP %4(<4 x s32>), %9
-    %11:_(<4 x s32>) = G_ANYEXT %10(<4 x s2>)
-    $q0 = COPY %11(<4 x s32>)
-
-...
----
-name:            test_ucmp_i128
-body:             |
-  bb.0.entry:
-    ; CHECK-LABEL: name: test_ucmp_i128
-    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[DEF]](s64), [[DEF]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]]
-    ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ICMP2]], [[ICMP]]
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[C1]]
-    ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[DEF]]
-    ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]]
-    ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s32), [[ICMP5]], [[ICMP3]]
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
-    ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[SELECT2]](s32), [[C2]], [[SELECT1]]
-    ; CHECK-NEXT: $w0 = COPY [[SELECT3]](s32)
-    %0:_(s64) = COPY $x0
-    %1:_(s64) = COPY $x0
-    %l:_(s128) = G_ANYEXT %0
-    %r:_(s128) = G_ANYEXT %1
-    %4:_(s2) = G_UCMP %l(s128), %r
-    %14:_(s32) = G_ANYEXT %4(s2)
-    $w0 = COPY %14(s32)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 87a415b45cca9..0e7804e98ae6d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -352,12 +352,11 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_SCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_UCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_SELECT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
index 8b6740667a229..e601f03d524a4 100644
--- a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
+++ b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
@@ -7,10 +7,10 @@
 @l = common hidden local_unnamed_addr global i32 0, align 4
 
 ; CHECK-LABEL: <test1>:
-; CHECK-LABEL: <$d>:
-; CHECK-LABEL: <$x>:
+; CHECK-LABEL: <$d.1>:
+; CHECK-LABEL: <$x.2>:
 ; CHECK-NEXT:    b 0x2c <test1+0x2c>
-; CHECK-LABEL: <$x>:
+; CHECK-LABEL: <$x.4>:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ldr x30, [sp], #16
 ; CHECK-NEXT:    ret
@@ -41,8 +41,8 @@ declare dso_local i32 @i(...) local_unnamed_addr
 
 ; CHECK-LABEL: <test2>:
 ; CHECK:         b {{.*}} <test2+0x1c>
-; CHECK-LABEL: <$d>:
-; CHECK-LABEL: <$x>:
+; CHECK-LABEL: <$d.5>:
+; CHECK-LABEL: <$x.6>:
 ; CHECK-NEXT:    b {{.*}} <test2+0x18>
 define hidden i32 @test2() local_unnamed_addr {
   %1 = load i32, ptr @l, align 4
@@ -70,10 +70,10 @@ define hidden i32 @test2() local_unnamed_addr {
 }
 
 ; CHECK-LABEL: <test3>:
-; CHECK-LABEL: <$d>:
-; CHECK-LABEL: <$x>:
+; CHECK-LABEL: <$d.9>:
+; CHECK-LABEL: <$x.10>:
 ; CHECK-NEXT:    b {{.*}} <test3+0x34>
-; CHECK-LABEL: <$x>:
+; CHECK-LABEL: <$x.12>:
 ; CHECK-NEXT:    ldr x30, [sp], #16
 ; CHECK-NEXT:    ret
 define internal i1 @test3() {
diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll
index 4b816df75a730..14cb0c82b1c03 100644
--- a/llvm/test/CodeGen/AArch64/cmp-chains.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll
@@ -258,246 +258,3 @@ define i32 @neg_range_int(i32 %a, i32 %b, i32 %c) {
   ret i32 %retval.0
 }
 
-; (b > -(d | 1) && a < c)
-define i32 @neg_range_int_comp(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp:
-; SDISEL:       // %bb.0:
-; SDISEL-NEXT:    orr w8, w3, #0x1
-; SDISEL-NEXT:    cmp w0, w2
-; SDISEL-NEXT:    ccmn w1, w8, #4, lt
-; SDISEL-NEXT:    csel w0, w1, w0, gt
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_comp:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    orr w8, w3, #0x1
-; GISEL-NEXT:    cmp w0, w2
-; GISEL-NEXT:    neg w8, w8
-; GISEL-NEXT:    ccmp w1, w8, #4, lt
-; GISEL-NEXT:    csel w0, w1, w0, gt
-; GISEL-NEXT:    ret
-  %dor = or i32 %d, 1
-  %negd = sub i32 0, %dor
-  %cmp = icmp sgt i32 %b, %negd
-  %cmp1 = icmp slt i32 %a, %c
-  %or.cond = and i1 %cmp, %cmp1
-  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
-  ret i32 %retval.0
-}
-
-; (b >u -(d | 1) && a < c)
-define i32 @neg_range_int_comp_u(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_u:
-; SDISEL:       // %bb.0:
-; SDISEL-NEXT:    orr w8, w3, #0x1
-; SDISEL-NEXT:    cmp w0, w2
-; SDISEL-NEXT:    ccmn w1, w8, #0, lt
-; SDISEL-NEXT:    csel w0, w1, w0, hi
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_comp_u:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    orr w8, w3, #0x1
-; GISEL-NEXT:    cmp w0, w2
-; GISEL-NEXT:    neg w8, w8
-; GISEL-NEXT:    ccmp w1, w8, #0, lt
-; GISEL-NEXT:    csel w0, w1, w0, hi
-; GISEL-NEXT:    ret
-  %dor = or i32 %d, 1
-  %negd = sub i32 0, %dor
-  %cmp = icmp ugt i32 %b, %negd
-  %cmp1 = icmp slt i32 %a, %c
-  %or.cond = and i1 %cmp, %cmp1
-  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
-  ret i32 %retval.0
-}
-
-; (b > -(d | 1) && a u < c)
-define i32 @neg_range_int_comp_ua(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_ua:
-; SDISEL:       // %bb.0:
-; SDISEL-NEXT:    orr w8, w3, #0x1
-; SDISEL-NEXT:    cmp w0, w2
-; SDISEL-NEXT:    ccmn w1, w8, #4, lo
-; SDISEL-NEXT:    csel w0, w1, w0, gt
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_comp_ua:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    orr w8, w3, #0x1
-; GISEL-NEXT:    cmp w0, w2
-; GISEL-NEXT:    neg w8, w8
-; GISEL-NEXT:    ccmp w1, w8, #4, lo
-; GISEL-NEXT:    csel w0, w1, w0, gt
-; GISEL-NEXT:    ret
-  %dor = or i32 %d, 1
-  %negd = sub i32 0, %dor
-  %cmp = icmp sgt i32 %b, %negd
-  %cmp1 = icmp ult i32 %a, %c
-  %or.cond = and i1 %cmp, %cmp1
-  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
-  ret i32 %retval.0
-}
-
-; (b <= -3 && a > c)
-define i32 @neg_range_int_2(i32 %a, i32 %b, i32 %c) {
-; SDISEL-LABEL: neg_range_int_2:
-; SDISEL:       // %bb.0:
-; SDISEL-NEXT:    cmp w0, w2
-; SDISEL-NEXT:    ccmn w1, #4, #4, gt
-; SDISEL-NEXT:    csel w0, w1, w0, gt
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_2:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    cmp w0, w2
-; GISEL-NEXT:    ccmn w1, #3, #8, gt
-; GISEL-NEXT:    csel w0, w1, w0, ge
-; GISEL-NEXT:    ret
-  %cmp = icmp sge i32 %b, -3
-  %cmp1 = icmp sgt i32 %a, %c
-  %or.cond = and i1 %cmp, %cmp1
-  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
-  ret i32 %retval.0
-}
-
-; (b < -(d | 1) && a >= c)
-define i32 @neg_range_int_comp2(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp2:
-; SDISEL:       // %bb.0:
-; SDISEL-NEXT:    orr w8, w3, #0x1
-; SDISEL-NEXT:    cmp w0, w2
-; SDISEL-NEXT:    ccmn w1, w8, #0, ge
-; SDISEL-NEXT:    csel w0, w1, w0, lt
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_comp2:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    orr w8, w3, #0x1
-; GISEL-NEXT:    cmp w0, w2
-; GISEL-NEXT:    neg w8, w8
-; GISEL-NEXT:    ccmp w1, w8, #0, ge
-; GISEL-NEXT:    csel w0, w1, w0, lt
-; GISEL-NEXT:    ret
-  %dor = or i32 %d, 1
-  %negd = sub i32 0, %dor
-  %cmp = icmp slt i32 %b, %negd
-  %cmp1 = icmp sge i32 %a, %c
-  %or.cond = and i1 %cmp, %cmp1
-  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
-  ret i32 %retval.0
-}
-
-; (b <u -(d | 1) && a > c)
-define i32 @neg_range_int_comp_u2(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_u2:
-; SDISEL:       // %bb.0:
-; SDISEL-NEXT:    orr w8, w3, #0x1
-; SDISEL-NEXT:    cmp w0, w2
-; SDISEL-NEXT:    ccmn w1, w8, #2, gt
-; SDISEL-NEXT:    csel w0, w1, w0, lo
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_comp_u2:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    orr w8, w3, #0x1
-; GISEL-NEXT:    cmp w0, w2
-; GISEL-NEXT:    neg w8, w8
-; GISEL-NEXT:    ccmp w1, w8, #2, gt
-; GISEL-NEXT:    csel w0, w1, w0, lo
-; GISEL-NEXT:    ret
-  %dor = or i32 %d, 1
-  %negd = sub i32 0, %dor
-  %cmp = icmp ult i32 %b, %negd
-  %cmp1 = icmp sgt i32 %a, %c
-  %or.cond = and i1 %cmp, %cmp1
-  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
-  ret i32 %retval.0
-}
-
-; (b > -(d | 1) && a u > c)
-define i32 @neg_range_int_comp_ua2(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_ua2:
-; SDISEL:       // %bb.0:
-; SDISEL-NEXT:    orr w8, w3, #0x1
-; SDISEL-NEXT:    cmp w0, w2
-; SDISEL-NEXT:    ccmn w1, w8, #4, hi
-; SDISEL-NEXT:    csel w0, w1, w0, gt
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_comp_ua2:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    orr w8, w3, #0x1
-; GISEL-NEXT:    cmp w0, w2
-; GISEL-NEXT:    neg w8, w8
-; GISEL-NEXT:    ccmp w1, w8, #4, hi
-; GISEL-NEXT:    csel w0, w1, w0, gt
-; GISEL-NEXT:    ret
-  %dor = or i32 %d, 1
-  %negd = sub i32 0, %dor
-  %cmp = icmp sgt i32 %b, %negd
-  %cmp1 = icmp ugt i32 %a, %c
-  %or.cond = and i1 %cmp, %cmp1
-  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
-  ret i32 %retval.0
-}
-
-; (b > -(d | 1) && a u == c)
-define i32 @neg_range_int_comp_ua3(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_ua3:
-; SDISEL:       // %bb.0:
-; SDISEL-NEXT:    orr w8, w3, #0x1
-; SDISEL-NEXT:    cmp w0, w2
-; SDISEL-NEXT:    ccmn w1, w8, #4, eq
-; SDISEL-NEXT:    csel w0, w1, w0, gt
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_comp_ua3:
-; GISEL:       // %bb.0:
-; GISEL-NEXT:    orr w8, w3, #0x1
-; GISEL-NEXT:    cmp w0, w2
-; GISEL-NEXT:    neg w8, w8
-; GISEL-NEXT:    ccmp w1, w8, #4, eq
-; GISEL-NEXT:    csel w0, w1, w0, gt
-; GISEL-NEXT:    ret
-  %dor = or i32 %d, 1
-  %negd = sub i32 0, %dor
-  %cmp = icmp sgt i32 %b, %negd
-  %cmp1 = icmp eq i32 %a, %c
-  %or.cond = and i1 %cmp, %cmp1
-  %retval.0 = select i1 %or.cond, i32 %b, i32 %a
-  ret i32 %retval.0
-}
-
-; -(a | 1) > (b | 3) && a < c
-define i32 @neg_range_int_c(i32 %a, i32 %b, i32 %c) {
-; SDISEL-LABEL: neg_range_int_c:
-; SDISEL:       // %bb.0: // %entry
-; SDISEL-NEXT:    orr w8, w0, #0x1
-; SDISEL-NEXT:    orr w9, w1, #0x3
-; SDISEL-NEXT:    cmn w9, w8
-; SDISEL-NEXT:    ccmp w2, w0, #2, lo
-; SDISEL-NEXT:    cset w0, lo
-; SDISEL-NEXT:    ret
-;
-; GISEL-LABEL: neg_range_int_c:
-; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    orr w8, w0, #0x1
-; GISEL-NEXT:    orr w9, w1, #0x3
-; GISEL-NEXT:    neg w8, w8
-; GISEL-NEXT:    cmp w9, w8
-; GISEL-NEXT:    cset w8, lo
-; GISEL-NEXT:    cmp w2, w0
-; GISEL-NEXT:    cset w9, lo
-; GISEL-NEXT:    and w0, w8, w9
-; GISEL-NEXT:    ret
-entry:
-  %or = or i32 %a, 1
-  %sub = sub i32 0, %or
-  %or1 = or i32 %b, 3
-  %cmp = icmp ult i32 %or1, %sub
-  %cmp2 = icmp ult i32 %c, %a
-  %0 = and i1 %cmp, %cmp2
-  %land.ext = zext i1 %0 to i32
-  ret i32 %land.ext
-}
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index 22440b79bdcd4..09a6e26fe5a40 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -262,210 +262,4 @@ define <4 x i65> @sign_4xi65(<4 x i65> %a) {
   ret <4 x i65> %res
 }
 
-define i32 @or_neg(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w0, #0x1
-; CHECK-NEXT:    cmn w1, w8
-; CHECK-NEXT:    cset w0, lt
-; CHECK-NEXT:    ret
-  %3 = or i32 %x, 1
-  %4 = sub i32 0, %3
-  %5 = icmp sgt i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg_ugt(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_ugt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w0, #0x1
-; CHECK-NEXT:    cmn w1, w8
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %3 = or i32 %x, 1
-  %4 = sub i32 0, %3
-  %5 = icmp ugt i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-; Negative test
-
-define i32 @or_neg_no_smin(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_no_smin:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w0
-; CHECK-NEXT:    cmp w8, w1
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %4 = sub i32 0, %x
-  %5 = icmp sgt i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-; Negative test
-
-define i32 @or_neg_ult_no_zero(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_ult_no_zero:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w0
-; CHECK-NEXT:    cmp w8, w1
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %4 = sub i32 0, %x
-  %5 = icmp ult i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg_no_smin_but_zero(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_no_smin_but_zero:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic w8, w0, w0, asr #31
-; CHECK-NEXT:    cmn w1, w8
-; CHECK-NEXT:    cset w0, lt
-; CHECK-NEXT:    ret
-  %3 = call i32 @llvm.smax.i32(i32 %x, i32 0)
-  %4 = sub i32 0, %3
-  %5 = icmp sgt i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg_slt_zero_but_no_smin(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_slt_zero_but_no_smin:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #9 // =0x9
-; CHECK-NEXT:    cmp w0, #9
-; CHECK-NEXT:    csel w8, w0, w8, lo
-; CHECK-NEXT:    neg w8, w8
-; CHECK-NEXT:    cmp w8, w1
-; CHECK-NEXT:    cset w0, hi
-; CHECK-NEXT:    ret
-  %3 = call i32 @llvm.umin.i32(i32 %x, i32 9)
-  %4 = sub i32 0, %3
-  %5 = icmp ugt i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg2(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w0, #0x1
-; CHECK-NEXT:    cmn w1, w8
-; CHECK-NEXT:    cset w0, le
-; CHECK-NEXT:    ret
-  %3 = or i32 %x, 1
-  %4 = sub i32 0, %3
-  %5 = icmp sge i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg3(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg3:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w0, #0x1
-; CHECK-NEXT:    cmn w1, w8
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %3 = or i32 %x, 1
-  %4 = sub i32 0, %3
-  %5 = icmp slt i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg4(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w0, #0x1
-; CHECK-NEXT:    cmn w1, w8
-; CHECK-NEXT:    cset w0, ge
-; CHECK-NEXT:    ret
-  %3 = or i32 %x, 1
-  %4 = sub i32 0, %3
-  %5 = icmp sle i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg_ult(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_ult:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w0, #0x1
-; CHECK-NEXT:    cmn w1, w8
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %3 = or i32 %x, 1
-  %4 = sub i32 0, %3
-  %5 = icmp ugt i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg_no_smin2(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_no_smin2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w0
-; CHECK-NEXT:    cmp w8, w1
-; CHECK-NEXT:    cset w0, ge
-; CHECK-NEXT:    ret
-  %4 = sub i32 0, %x
-  %5 = icmp sge i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-; Negative test
-
-define i32 @or_neg_ult_no_zero2(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_ult_no_zero2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w0
-; CHECK-NEXT:    cmp w8, w1
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %4 = sub i32 0, %x
-  %5 = icmp ult i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg_no_smin_but_zero2(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_no_smin_but_zero2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic w8, w0, w0, asr #31
-; CHECK-NEXT:    cmn w1, w8
-; CHECK-NEXT:    cset w0, ge
-; CHECK-NEXT:    ret
-  %3 = call i32 @llvm.smax.i32(i32 %x, i32 0)
-  %4 = sub i32 0, %3
-  %5 = icmp sle i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-define i32 @or_neg_slt_zero_but_no_smin2(i32 %x, i32 %y) {
-; CHECK-LABEL: or_neg_slt_zero_but_no_smin2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #9 // =0x9
-; CHECK-NEXT:    cmp w0, #9
-; CHECK-NEXT:    csel w8, w0, w8, lo
-; CHECK-NEXT:    neg w8, w8
-; CHECK-NEXT:    cmp w8, w1
-; CHECK-NEXT:    cset w0, hs
-; CHECK-NEXT:    ret
-  %3 = call i32 @llvm.umin.i32(i32 %x, i32 9)
-  %4 = sub i32 0, %3
-  %5 = icmp uge i32 %4, %y
-  %6 = zext i1 %5 to i32
-  ret i32 %6
-}
-
-declare i32 @llvm.smax.i32(i32, i32)
-declare i32 @llvm.umax.i32(i32, i32)
 declare void @use_4xi1(<4 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 1c8a8d635274e..2ef35283568c3 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -229,8 +229,7 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    add x2, x2, x11
-; CHECK-NEXT:    cmpne p2.d, p0/z, z2.d, #0
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p2.b
+; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
 ; CHECK-NEXT:    zip1 p1.d, p1.d, p1.d
 ; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x0, #1, mul vl]
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index b10114bc0ffa7..cf5e15da0b173 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -145,11 +145,10 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
 define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: add_ext_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddl2 v3.8h, v0.16b, v1.16b
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    add v0.8h, v0.8h, v3.8h
-; CHECK-NEXT:    uadalp v0.8h, v2.16b
-; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-NEXT:    uadalp v1.8h, v0.16b
+; CHECK-NEXT:    uadalp v1.8h, v2.16b
+; CHECK-NEXT:    addv h0, v1.8h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %ae = zext <32 x i8> %a to <32 x i16>
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-select.ll b/llvm/test/CodeGen/AArch64/fast-isel-select.ll
index 6e55bf4968e78..65701343ccc1e 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-select.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-select.ll
@@ -1,16 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SDAGISEL
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FASTISEL
 ; RUN: llc -mtriple=aarch64-apple-darwin -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL
 
 ; First test the different supported value types for select.
 define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
-; CHECK-SDAGISEL-LABEL: select_i1:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    cmp w0, #0
-; CHECK-SDAGISEL-NEXT:    csel w0, w1, w2, ne
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_i1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel w0, w1, w2, ne
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_i1:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -28,12 +27,11 @@ define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
 }
 
 define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) {
-; CHECK-SDAGISEL-LABEL: select_i8:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    cmp w0, #0
-; CHECK-SDAGISEL-NEXT:    csel w0, w1, w2, ne
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel w0, w1, w2, ne
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_i8:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -51,12 +49,11 @@ define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) {
 }
 
 define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) {
-; CHECK-SDAGISEL-LABEL: select_i16:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    cmp w0, #0
-; CHECK-SDAGISEL-NEXT:    csel w0, w1, w2, ne
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_i16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel w0, w1, w2, ne
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_i16:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -74,12 +71,11 @@ define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) {
 }
 
 define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) {
-; CHECK-SDAGISEL-LABEL: select_i32:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    cmp w0, #0
-; CHECK-SDAGISEL-NEXT:    csel w0, w1, w2, ne
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_i32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel w0, w1, w2, ne
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_i32:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -96,12 +92,11 @@ define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) {
 }
 
 define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) {
-; CHECK-SDAGISEL-LABEL: select_i64:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    cmp w0, #0
-; CHECK-SDAGISEL-NEXT:    csel x0, x1, x2, ne
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_i64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    csel x0, x1, x2, ne
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_i64:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -118,12 +113,11 @@ define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) {
 }
 
 define float @select_f32(i1 zeroext %c, float %a, float %b) {
-; CHECK-SDAGISEL-LABEL: select_f32:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    cmp w0, #0
-; CHECK-SDAGISEL-NEXT:    fcsel s0, s0, s1, ne
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_f32:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    fcsel s0, s0, s1, ne
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_f32:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -140,12 +134,11 @@ define float @select_f32(i1 zeroext %c, float %a, float %b) {
 }
 
 define double @select_f64(i1 zeroext %c, double %a, double %b) {
-; CHECK-SDAGISEL-LABEL: select_f64:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    cmp w0, #0
-; CHECK-SDAGISEL-NEXT:    fcsel d0, d0, d1, ne
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    tst w0, #0x1
+; GISEL-NEXT:    fcsel d0, d0, d1, ne
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_f64:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    tst w0, #0x1
@@ -163,11 +156,6 @@ define double @select_f64(i1 zeroext %c, double %a, double %b) {
 
 ; Now test the folding of all compares.
 define float @select_fcmp_false(float %x, float %a, float %b) {
-; CHECK-SDAGISEL-LABEL: select_fcmp_false:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    fmov s0, s2
-; CHECK-SDAGISEL-NEXT:    ret
-;
 ; CHECK-FASTISEL-LABEL: select_fcmp_false:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    fmov s0, s2
@@ -178,6 +166,11 @@ define float @select_fcmp_false(float %x, float %a, float %b) {
 ; CHECK-GISEL-NEXT:    fcmp s0, s0
 ; CHECK-GISEL-NEXT:    fcsel s0, s1, s2, gt
 ; CHECK-GISEL-NEXT:    ret
+; GISEL-LABEL: select_fcmp_false:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s0
+; GISEL-NEXT:    fcsel s0, s1, s2, gt
+; GISEL-NEXT:    ret
   %1 = fcmp ogt float %x, %x
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -189,6 +182,11 @@ define float @select_fcmp_ogt(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, gt
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ogt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, gt
+; GISEL-NEXT:    ret
   %1 = fcmp ogt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -200,6 +198,11 @@ define float @select_fcmp_oge(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, ge
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_oge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, ge
+; GISEL-NEXT:    ret
   %1 = fcmp oge float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -211,6 +214,11 @@ define float @select_fcmp_olt(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, mi
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_olt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, mi
+; GISEL-NEXT:    ret
   %1 = fcmp olt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -222,19 +230,17 @@ define float @select_fcmp_ole(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, ls
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ole:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, ls
+; GISEL-NEXT:    ret
   %1 = fcmp ole float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_one(float %x, float %y, float %a, float %b) {
-; CHECK-SDAGISEL-LABEL: select_fcmp_one:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    fcmp s0, s1
-; CHECK-SDAGISEL-NEXT:    fcsel s0, s2, s3, mi
-; CHECK-SDAGISEL-NEXT:    fcsel s0, s2, s0, gt
-; CHECK-SDAGISEL-NEXT:    ret
-;
 ; CHECK-FASTISEL-LABEL: select_fcmp_one:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    fcmp s0, s1
@@ -251,6 +257,15 @@ define float @select_fcmp_one(float %x, float %y, float %a, float %b) {
 ; CHECK-GISEL-NEXT:    tst w8, #0x1
 ; CHECK-GISEL-NEXT:    fcsel s0, s2, s3, ne
 ; CHECK-GISEL-NEXT:    ret
+; GISEL-LABEL: select_fcmp_one:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    cset w8, mi
+; GISEL-NEXT:    cset w9, gt
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    tst w8, #0x1
+; GISEL-NEXT:    fcsel s0, s2, s3, ne
+; GISEL-NEXT:    ret
   %1 = fcmp one float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -262,6 +277,11 @@ define float @select_fcmp_ord(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, vc
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ord:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, vc
+; GISEL-NEXT:    ret
   %1 = fcmp ord float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -273,19 +293,17 @@ define float @select_fcmp_uno(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, vs
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_uno:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, vs
+; GISEL-NEXT:    ret
   %1 = fcmp uno float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) {
-; CHECK-SDAGISEL-LABEL: select_fcmp_ueq:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    fcmp s0, s1
-; CHECK-SDAGISEL-NEXT:    fcsel s0, s2, s3, eq
-; CHECK-SDAGISEL-NEXT:    fcsel s0, s2, s0, vs
-; CHECK-SDAGISEL-NEXT:    ret
-;
 ; CHECK-FASTISEL-LABEL: select_fcmp_ueq:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    fcmp s0, s1
@@ -302,6 +320,15 @@ define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) {
 ; CHECK-GISEL-NEXT:    tst w8, #0x1
 ; CHECK-GISEL-NEXT:    fcsel s0, s2, s3, ne
 ; CHECK-GISEL-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ueq:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    cset w8, eq
+; GISEL-NEXT:    cset w9, vs
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    tst w8, #0x1
+; GISEL-NEXT:    fcsel s0, s2, s3, ne
+; GISEL-NEXT:    ret
   %1 = fcmp ueq float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -313,6 +340,11 @@ define float @select_fcmp_ugt(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, hi
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ugt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, hi
+; GISEL-NEXT:    ret
   %1 = fcmp ugt float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -324,6 +356,11 @@ define float @select_fcmp_uge(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, pl
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_uge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, pl
+; GISEL-NEXT:    ret
   %1 = fcmp uge float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -335,6 +372,11 @@ define float @select_fcmp_ult(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, lt
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ult:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, lt
+; GISEL-NEXT:    ret
   %1 = fcmp ult float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -347,6 +389,11 @@ define float @select_fcmp_ule(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, le
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_ule:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, le
+; GISEL-NEXT:    ret
   %1 = fcmp ule float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -358,17 +405,17 @@ define float @select_fcmp_une(float %x, float %y, float %a, float %b) {
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s2, s3, ne
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_fcmp_une:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s1
+; GISEL-NEXT:    fcsel s0, s2, s3, ne
+; GISEL-NEXT:    ret
   %1 = fcmp une float %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
 }
 
 define float @select_fcmp_true(float %x, float %a, float %b) {
-; CHECK-SDAGISEL-LABEL: select_fcmp_true:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    fmov s0, s1
-; CHECK-SDAGISEL-NEXT:    ret
-;
 ; CHECK-FASTISEL-LABEL: select_fcmp_true:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    fmov s0, s1
@@ -383,6 +430,15 @@ define float @select_fcmp_true(float %x, float %a, float %b) {
 ; CHECK-GISEL-NEXT:    tst w8, #0x1
 ; CHECK-GISEL-NEXT:    fcsel s0, s1, s2, ne
 ; CHECK-GISEL-NEXT:    ret
+; GISEL-LABEL: select_fcmp_true:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    fcmp s0, s0
+; GISEL-NEXT:    cset w8, eq
+; GISEL-NEXT:    cset w9, vs
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    tst w8, #0x1
+; GISEL-NEXT:    fcsel s0, s1, s2, ne
+; GISEL-NEXT:    ret
   %1 = fcmp ueq float %x, %x
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -394,6 +450,11 @@ define float @select_icmp_eq(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, eq
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_eq:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, eq
+; GISEL-NEXT:    ret
   %1 = icmp eq i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -405,6 +466,11 @@ define float @select_icmp_ne(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, ne
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_ne:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, ne
+; GISEL-NEXT:    ret
   %1 = icmp ne i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -416,6 +482,11 @@ define float @select_icmp_ugt(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, hi
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_ugt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, hi
+; GISEL-NEXT:    ret
   %1 = icmp ugt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -427,6 +498,11 @@ define float @select_icmp_uge(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, hs
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_uge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, hs
+; GISEL-NEXT:    ret
   %1 = icmp uge i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -438,6 +514,11 @@ define float @select_icmp_ult(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, lo
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_ult:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, lo
+; GISEL-NEXT:    ret
   %1 = icmp ult i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -449,6 +530,11 @@ define float @select_icmp_ule(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, ls
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_ule:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, ls
+; GISEL-NEXT:    ret
   %1 = icmp ule i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -460,6 +546,11 @@ define float @select_icmp_sgt(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, gt
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_sgt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, gt
+; GISEL-NEXT:    ret
   %1 = icmp sgt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -471,6 +562,11 @@ define float @select_icmp_sge(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, ge
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_sge:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, ge
+; GISEL-NEXT:    ret
   %1 = icmp sge i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -482,6 +578,11 @@ define float @select_icmp_slt(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, lt
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_slt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, lt
+; GISEL-NEXT:    ret
   %1 = icmp slt i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -493,6 +594,11 @@ define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
 ; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    fcsel s0, s0, s1, le
 ; CHECK-NEXT:    ret
+; GISEL-LABEL: select_icmp_sle:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, w1
+; GISEL-NEXT:    fcsel s0, s0, s1, le
+; GISEL-NEXT:    ret
   %1 = icmp sle i32 %x, %y
   %2 = select i1 %1, float %a, float %b
   ret float %2
@@ -500,22 +606,33 @@ define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
 
 ; Test peephole optimizations for select.
 define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) {
-; CHECK-LABEL: select_opt1:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    orr w8, w0, w1
-; CHECK-NEXT:    and w0, w8, #0x1
-; CHECK-NEXT:    ret
+; GISEL-LABEL: select_opt1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    orr w8, w0, w1
+; GISEL-NEXT:    and w0, w8, #0x1
+; GISEL-NEXT:    ret
+; CHECK-FASTISEL-LABEL: select_opt1:
+; CHECK-FASTISEL:       ; %bb.0:
+; CHECK-FASTISEL-NEXT:    orr w8, w0, w1
+; CHECK-FASTISEL-NEXT:    and w0, w8, #0x1
+; CHECK-FASTISEL-NEXT:    ret
+;
+; CHECK-GISEL-LABEL: select_opt1:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    orr w8, w0, w1
+; CHECK-GISEL-NEXT:    and w0, w8, #0x1
+; CHECK-GISEL-NEXT:    ret
   %1 = select i1 %c, i1 true, i1 %a
   ret i1 %1
 }
 
 define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
-; CHECK-SDAGISEL-LABEL: select_opt2:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    orn w8, w1, w0
-; CHECK-SDAGISEL-NEXT:    and w0, w8, #0x1
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_opt2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    eor w8, w0, #0x1
+; GISEL-NEXT:    orr w8, w8, w1
+; GISEL-NEXT:    and w0, w8, #0x1
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_opt2:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    eor w8, w0, #0x1
@@ -534,12 +651,11 @@ define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
 }
 
 define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
-; CHECK-SDAGISEL-LABEL: select_opt3:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    eor w8, w0, #0x1
-; CHECK-SDAGISEL-NEXT:    and w0, w8, w1
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_opt3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    eor w8, w0, #0x1
+; GISEL-NEXT:    and w0, w8, w1
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_opt3:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    bic w8, w1, w0
@@ -556,11 +672,10 @@ define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
 }
 
 define zeroext i1 @select_opt4(i1 zeroext %c, i1 zeroext %a) {
-; CHECK-SDAGISEL-LABEL: select_opt4:
-; CHECK-SDAGISEL:       ; %bb.0:
-; CHECK-SDAGISEL-NEXT:    and w0, w0, w1
-; CHECK-SDAGISEL-NEXT:    ret
-;
+; GISEL-LABEL: select_opt4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    and w0, w0, w1
+; GISEL-NEXT:    ret
 ; CHECK-FASTISEL-LABEL: select_opt4:
 ; CHECK-FASTISEL:       ; %bb.0:
 ; CHECK-FASTISEL-NEXT:    and w8, w0, w1
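
The select_opt hunks above re-prefix the boolean-select peepholes per selector (GISEL, CHECK-FASTISEL, CHECK-GISEL). The peepholes themselves turn i1 selects into plain logic: select_opt1's body, select i1 %c, i1 true, i1 %a, is a logical or, which is why every selector emits orr followed by and #0x1 to renormalize the i1. A scalar model, illustrative only:

#include <stdbool.h>

/* select i1 %c, i1 true, i1 %a == c | a; the and #0x1 in the checks
   just masks the result back to one bit. Sketch only. */
bool select_opt1(bool c, bool a) {
    return c ? true : a;
}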
diff --git a/llvm/test/CodeGen/AArch64/hardened-br-jump-table.ll b/llvm/test/CodeGen/AArch64/hardened-br-jump-table.ll
deleted file mode 100644
index f20ea4db7baeb..0000000000000
--- a/llvm/test/CodeGen/AArch64/hardened-br-jump-table.ll
+++ /dev/null
@@ -1,133 +0,0 @@
-; RUN: rm -rf %t && split-file %s %t
-
-;--- err1.ll
-
-; RUN: not --crash llc %t/err1.ll -mtriple=aarch64-elf \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -code-model=large \
-; RUN:   -o - -verify-machineinstrs 2>&1 | FileCheck %s --check-prefix=ERR1
-
-; RUN: not --crash llc %t/err1.ll -mtriple=aarch64-elf \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -global-isel -global-isel-abort=1 \
-; RUN:   -code-model=large \
-; RUN:   -o - -verify-machineinstrs 2>&1 | FileCheck %s --check-prefix=ERR1
-
-; ERR1: LLVM ERROR: Unsupported code-model for hardened jump-table
-define i32 @test_jumptable(i32 %in) "aarch64-jump-table-hardening" {
-
-  switch i32 %in, label %def [
-    i32 0, label %lbl1
-    i32 1, label %lbl2
-  ]
-
-def:
-  ret i32 0
-
-lbl1:
-  ret i32 1
-
-lbl2:
-  ret i32 2
-}
-
-;--- test.ll
-
-; RUN: llc %t/test.ll -mtriple=arm64-apple-darwin -aarch64-enable-collect-loh=0 \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=MACHO
-
-; RUN: llc %t/test.ll -mtriple=arm64-apple-darwin -aarch64-enable-collect-loh=0 \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -global-isel -global-isel-abort=1 \
-; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=MACHO
-
-; RUN: llc %t/test.ll -mtriple=arm64-apple-darwin -aarch64-enable-collect-loh=0 \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -code-model=large \
-; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=MACHO
-
-; RUN: llc %t/test.ll -mtriple=arm64-apple-darwin -aarch64-enable-collect-loh=0 \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -global-isel -global-isel-abort=1 \
-; RUN:   -code-model=large \
-; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=MACHO
-
-; RUN: llc %t/test.ll -mtriple=aarch64-elf \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=ELF
-
-; RUN: llc %t/test.ll -mtriple=aarch64-elf \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -global-isel -global-isel-abort=1 \
-; RUN:   -o - -verify-machineinstrs | FileCheck %s --check-prefix=ELF
-
-; MACHO-LABEL: test_jumptable:
-; MACHO:        mov   w16, w0
-; MACHO:        cmp   x16, #5
-; MACHO:        csel  x16, x16, xzr, ls
-; MACHO-NEXT:   adrp  x17, LJTI0_0@PAGE
-; MACHO-NEXT:   add   x17, x17, LJTI0_0@PAGEOFF
-; MACHO-NEXT:   ldrsw x16, [x17, x16, lsl #2]
-; MACHO-NEXT:  Ltmp0:
-; MACHO-NEXT:   adr   x17, Ltmp0
-; MACHO-NEXT:   add   x16, x17, x16
-; MACHO-NEXT:   br    x16
-
-; ELF-LABEL: test_jumptable:
-; ELF:        mov   w16, w0
-; ELF:        cmp   x16, #5
-; ELF:        csel  x16, x16, xzr, ls
-; ELF-NEXT:   adrp  x17, .LJTI0_0
-; ELF-NEXT:   add   x17, x17, :lo12:.LJTI0_0
-; ELF-NEXT:   ldrsw x16, [x17, x16, lsl #2]
-; ELF-NEXT:  .Ltmp0:
-; ELF-NEXT:   adr   x17, .Ltmp0
-; ELF-NEXT:   add   x16, x17, x16
-; ELF-NEXT:   br    x16
-
-define i32 @test_jumptable(i32 %in) "aarch64-jump-table-hardening" {
-
-  switch i32 %in, label %def [
-    i32 0, label %lbl1
-    i32 1, label %lbl2
-    i32 2, label %lbl3
-    i32 4, label %lbl4
-    i32 5, label %lbl5
-  ]
-
-def:
-  ret i32 0
-
-lbl1:
-  ret i32 1
-
-lbl2:
-  ret i32 2
-
-lbl3:
-  ret i32 4
-
-lbl4:
-  ret i32 8
-
-lbl5:
-  ret i32 10
-
-}
-
-; MACHO-LABEL: LJTI0_0:
-; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
-; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
-; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
-; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
-; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
-; MACHO-NEXT: .long LBB{{[0-9_]+}}-Ltmp0
-
-; ELF-LABEL: .LJTI0_0:
-; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
-; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
-; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
-; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
-; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
-; ELF-NEXT: .word .LBB{{[0-9_]+}}-.Ltmp0
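
hardened-br-jump-table.ll is removed by this patch; its checks documented the hardened dispatch sequence: clamp the index branchlessly with cmp/csel, load the signed 32-bit table entry with ldrsw, and add it to an adr-taken anchor before br. A C model of that shape, with hypothetical names, assuming the 6-entry table from the deleted test:

#include <stdint.h>

/* Sketch of the hardened jump-table dispatch the deleted checks pinned
   down; models the instruction sequence, not compiler code. */
uintptr_t jump_table_target(uint64_t idx, const int32_t *table,
                            uint64_t max_idx, uintptr_t anchor) {
    idx = idx <= max_idx ? idx : 0;       /* cmp x16, #5; csel x16, x16, xzr, ls */
    return anchor + (int64_t)table[idx];  /* ldrsw; adr Ltmp0; add; br x16 */
}

The clamp is what makes the table hardened: a corrupted index can only ever select entry 0, never read past the table.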
diff --git a/llvm/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll b/llvm/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
index f3623928d67bf..f2b5e5f0064b7 100644
--- a/llvm/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
+++ b/llvm/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
@@ -16,7 +16,7 @@ entry:
 ; CHECK:        58000040                                         ldr    x0, 0x10
 ; CHECK:        d65f03c0                                         ret
 ; Make sure the constant pool entry comes after the return
-; CHECK-LABEL:        <$d>:
+; CHECK-LABEL:        <$d.1>:
 define i32 @bar() nounwind {
 entry:
   %0 = tail call i32 asm sideeffect "ldr $0,=0x10001", "=r"() nounwind
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 66d670d0b796b..cdf2a962f9322 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -319,9 +319,8 @@ define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %a) {
 define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: ctz_and_nxv16i1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, z1.b
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    cmpne p2.b, p1/z, z0.b, z1.b
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p2.b
 ; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
 ; CHECK-NEXT:    cntp x0, p0, p0.b
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
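
The improvement here comes from SVE predication: cmpne with p0 as the governing predicate (p0/z) already computes pg & (a != b) in one instruction, so the old cmpne-into-p2 plus and pair was redundant. A scalar model of what ctz_and_nxv16i1 computes, illustrative only:

#include <stdbool.h>
#include <stdint.h>

/* Count lanes before the first one where pg is set and a != b; the
   brkb/cntp pair in the checks does this on the predicate. Sketch. */
int ctz_and(const bool *pg, const uint8_t *a, const uint8_t *b, int n) {
    for (int i = 0; i < n; ++i)
        if (pg[i] && a[i] != b[i])
            return i;
    return n;
}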
diff --git a/llvm/test/CodeGen/AArch64/neon-abd.ll b/llvm/test/CodeGen/AArch64/neon-abd.ll
index 18364bdecee02..f743bae84053d 100644
--- a/llvm/test/CodeGen/AArch64/neon-abd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-abd.ll
@@ -332,6 +332,8 @@ define <2 x i64> @uabd_2d(<2 x i64> %a, <2 x i64> %b) #0 {
 ; CHECK-NEXT:    ngc x9, xzr
 ; CHECK-NEXT:    subs x10, x10, x11
 ; CHECK-NEXT:    ngc x11, xzr
+; CHECK-NEXT:    asr x9, x9, #63
+; CHECK-NEXT:    asr x11, x11, #63
 ; CHECK-NEXT:    eor x8, x8, x9
 ; CHECK-NEXT:    eor x10, x10, x11
 ; CHECK-NEXT:    sub x8, x8, x9
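
The added asr #63 lines form the full sign mask consumed by the eor/sub conditional negate that follows; the pair is the standard mask-based absolute value. A scalar model of one lane of uabd_2d, illustrative only:

#include <stdint.h>

/* m is all-ones exactly when a < b, and (d ^ m) - m negates d in that
   case, giving |a - b|. Models the subs/mask/eor/sub pattern in the
   checks, not the backend code itself. */
uint64_t uabd64(uint64_t a, uint64_t b) {
    uint64_t d = a - b;
    uint64_t m = (a < b) ? ~0ULL : 0ULL;  /* ngc + asr #63 in the asm */
    return (d ^ m) - m;                   /* eor + sub */
}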
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll b/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll
deleted file mode 100644
index 6afe1a93d986e..0000000000000
--- a/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll
+++ /dev/null
@@ -1,374 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple arm64e-apple-darwin                                                -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,NOFPAC
-; RUN: llc < %s -mtriple arm64e-apple-darwin -mattr=+fpac                                   -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,FPAC
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-
-define i64 @test_auth_ia(i64 %arg, i64 %arg1) {
-; ALL-LABEL: test_auth_ia:
-; ALL:       ; %bb.0:
-; ALL-NEXT:    mov x16, x0
-; ALL-NEXT:    autia x16, x1
-; ALL-NEXT:    mov x0, x16
-; ALL-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_ia_zero(i64 %arg) {
-; ALL-LABEL: test_auth_ia_zero:
-; ALL:       ; %bb.0:
-; ALL-NEXT:    mov x16, x0
-; ALL-NEXT:    autiza x16
-; ALL-NEXT:    mov x0, x16
-; ALL-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 0)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_ib(i64 %arg, i64 %arg1) {
-; ALL-LABEL: test_auth_ib:
-; ALL:       ; %bb.0:
-; ALL-NEXT:    mov x16, x0
-; ALL-NEXT:    autib x16, x1
-; ALL-NEXT:    mov x0, x16
-; ALL-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 1, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_ib_zero(i64 %arg) {
-; ALL-LABEL: test_auth_ib_zero:
-; ALL:       ; %bb.0:
-; ALL-NEXT:    mov x16, x0
-; ALL-NEXT:    autizb x16
-; ALL-NEXT:    mov x0, x16
-; ALL-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 1, i64 0)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_da(i64 %arg, i64 %arg1) {
-; ALL-LABEL: test_auth_da:
-; ALL:       ; %bb.0:
-; ALL-NEXT:    mov x16, x0
-; ALL-NEXT:    autda x16, x1
-; ALL-NEXT:    mov x0, x16
-; ALL-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_da_zero(i64 %arg) {
-; ALL-LABEL: test_auth_da_zero:
-; ALL:       ; %bb.0:
-; ALL-NEXT:    mov x16, x0
-; ALL-NEXT:    autdza x16
-; ALL-NEXT:    mov x0, x16
-; ALL-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 0)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_db(i64 %arg, i64 %arg1) {
-; ALL-LABEL: test_auth_db:
-; ALL:       ; %bb.0:
-; ALL-NEXT:    mov x16, x0
-; ALL-NEXT:    autdb x16, x1
-; ALL-NEXT:    mov x0, x16
-; ALL-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 3, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_db_zero(i64 %arg) {
-; ALL-LABEL: test_auth_db_zero:
-; ALL:       ; %bb.0:
-; ALL-NEXT:    mov x16, x0
-; ALL-NEXT:    autdzb x16
-; ALL-NEXT:    mov x0, x16
-; ALL-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 3, i64 0)
-  ret i64 %tmp
-}
-
-; Note that this might seem like a no-op but is actually a valid way to enforce
-; the validity of a signature.
-define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_ia_ia:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autia x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpaci x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_0
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_0
-; NOFPAC-NEXT:  Lauth_success_0:
-; NOFPAC-NEXT:    pacia x16, x2
-; NOFPAC-NEXT:  Lresign_end_0:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_ia_ia:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autia x16, x1
-; FPAC-NEXT:    pacia x16, x2
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 0, i64 %arg1, i32 0, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_ib_ia:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autib x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpaci x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_1
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_1
-; NOFPAC-NEXT:  Lauth_success_1:
-; NOFPAC-NEXT:    pacia x16, x2
-; NOFPAC-NEXT:  Lresign_end_1:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_ib_ia:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autib x16, x1
-; FPAC-NEXT:    pacia x16, x2
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 1, i64 %arg1, i32 0, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_da_ia:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autda x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpacd x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_2
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_2
-; NOFPAC-NEXT:  Lauth_success_2:
-; NOFPAC-NEXT:    pacia x16, x2
-; NOFPAC-NEXT:  Lresign_end_2:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_da_ia:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autda x16, x1
-; FPAC-NEXT:    pacia x16, x2
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 0, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_db_ia(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_db_ia:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autdb x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpacd x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_3
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_3
-; NOFPAC-NEXT:  Lauth_success_3:
-; NOFPAC-NEXT:    pacia x16, x2
-; NOFPAC-NEXT:  Lresign_end_3:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_db_ia:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autdb x16, x1
-; FPAC-NEXT:    pacia x16, x2
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 0, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_db_ib(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_db_ib:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autdb x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpacd x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_4
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_4
-; NOFPAC-NEXT:  Lauth_success_4:
-; NOFPAC-NEXT:    pacib x16, x2
-; NOFPAC-NEXT:  Lresign_end_4:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_db_ib:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autdb x16, x1
-; FPAC-NEXT:    pacib x16, x2
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 1, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_db_da:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autdb x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpacd x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_5
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_5
-; NOFPAC-NEXT:  Lauth_success_5:
-; NOFPAC-NEXT:    pacda x16, x2
-; NOFPAC-NEXT:  Lresign_end_5:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_db_da:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autdb x16, x1
-; FPAC-NEXT:    pacda x16, x2
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 2, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_db_db(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_db_db:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autdb x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpacd x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_6
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_6
-; NOFPAC-NEXT:  Lauth_success_6:
-; NOFPAC-NEXT:    pacdb x16, x2
-; NOFPAC-NEXT:  Lresign_end_6:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_db_db:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autdb x16, x1
-; FPAC-NEXT:    pacdb x16, x2
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 3, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_iza_db:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autiza x16
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpaci x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_7
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_7
-; NOFPAC-NEXT:  Lauth_success_7:
-; NOFPAC-NEXT:    pacdb x16, x2
-; NOFPAC-NEXT:  Lresign_end_7:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_iza_db:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autiza x16
-; FPAC-NEXT:    pacdb x16, x2
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 0, i64 0, i32 3, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
-; NOFPAC-LABEL: test_resign_da_dzb:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autda x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpacd x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_8
-; NOFPAC-NEXT:    mov x16, x17
-; NOFPAC-NEXT:    b Lresign_end_8
-; NOFPAC-NEXT:  Lauth_success_8:
-; NOFPAC-NEXT:    pacdzb x16
-; NOFPAC-NEXT:  Lresign_end_8:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_resign_da_dzb:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autda x16, x1
-; FPAC-NEXT:    pacdzb x16
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 3, i64 0)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_trap_attribute(i64 %arg, i64 %arg1) "ptrauth-auth-traps" {
-; NOFPAC-LABEL: test_auth_trap_attribute:
-; NOFPAC:       ; %bb.0:
-; NOFPAC-NEXT:    mov x16, x0
-; NOFPAC-NEXT:    autia x16, x1
-; NOFPAC-NEXT:    mov x17, x16
-; NOFPAC-NEXT:    xpaci x17
-; NOFPAC-NEXT:    cmp x16, x17
-; NOFPAC-NEXT:    b.eq Lauth_success_9
-; NOFPAC-NEXT:    brk #0xc470
-; NOFPAC-NEXT:  Lauth_success_9:
-; NOFPAC-NEXT:    mov x0, x16
-; NOFPAC-NEXT:    ret
-;
-; FPAC-LABEL: test_auth_trap_attribute:
-; FPAC:       ; %bb.0:
-; FPAC-NEXT:    mov x16, x0
-; FPAC-NEXT:    autia x16, x1
-; FPAC-NEXT:    mov x0, x16
-; FPAC-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 %arg1)
-  ret i64 %tmp
-}
-
-declare i64 @llvm.ptrauth.auth(i64, i32, i64)
-declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64)
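
ptrauth-fpac.ll is removed by this patch. Its NOFPAC checks documented the software authentication check used when FEAT_FPAC is absent: AUT* does not fault on failure there, so the sequence strips the result with xpac and compares, and only re-signs on a match. A pseudocode model, where aut/xpac/pac are hypothetical stand-ins for the instructions:

#include <stdint.h>

extern uint64_t aut(uint64_t p, uint64_t disc);  /* hypothetical AUT*  */
extern uint64_t xpac(uint64_t p);                /* hypothetical XPAC* */
extern uint64_t pac(uint64_t p, uint64_t disc);  /* hypothetical PAC*  */

/* Sketch of the NOFPAC resign check (aut, xpac, cmp, b.eq). */
uint64_t resign_checked(uint64_t signed_ptr, uint64_t old_disc,
                        uint64_t new_disc) {
    uint64_t p = aut(signed_ptr, old_disc);
    uint64_t stripped = xpac(p);
    if (p == stripped)                   /* no PAC bits left: auth passed */
        return pac(stripped, new_disc);  /* success path: re-sign */
    return stripped;                     /* failure: left unsigned-invalid */
}

With +fpac the hardware faults on a bad signature, which is why the FPAC checks collapse to a bare aut/pac pair; the "ptrauth-auth-traps" variant swaps the failure path for brk.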
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-indirectbr.ll b/llvm/test/CodeGen/AArch64/ptrauth-indirectbr.ll
deleted file mode 100644
index 94de1b4f949e4..0000000000000
--- a/llvm/test/CodeGen/AArch64/ptrauth-indirectbr.ll
+++ /dev/null
@@ -1,248 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-
-; RUN: llc -mtriple arm64e-apple-darwin \
-; RUN:   -aarch64-enable-collect-loh=false \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -o - %s | FileCheck %s --check-prefix=MACHO
-
-; RUN: llc -mtriple arm64e-apple-darwin \
-; RUN:   -fast-isel \
-; RUN:   -aarch64-enable-collect-loh=false \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -o - %s | FileCheck %s --check-prefix=MACHO
-
-; RUN: llc -mtriple arm64e-apple-darwin \
-; RUN:   -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN:   -aarch64-enable-collect-loh=false \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -o - %s | FileCheck %s --check-prefix=MACHO
-
-; RUN: llc -mtriple aarch64-elf -mattr=+pauth \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -o - %s | FileCheck %s --check-prefix=ELF
-
-; RUN: llc -mtriple aarch64-elf -mattr=+pauth \
-; RUN:   -fast-isel \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -o - %s | FileCheck %s --check-prefix=ELF
-
-; RUN: llc -mtriple aarch64-elf -mattr=+pauth \
-; RUN:   -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN:   -aarch64-min-jump-table-entries=1 -aarch64-enable-atomic-cfg-tidy=0 \
-; RUN:   -o - %s | FileCheck %s --check-prefix=ELF
-
-;; The discriminator is the same for all blockaddresses in the function.
-;; ptrauth_string_discriminator("test_indirectbr blockaddress") == 34947
-
-define i32 @test_indirectbr() #0 {
-; MACHO-LABEL: test_indirectbr:
-; MACHO:       ; %bb.0: ; %entry
-; MACHO-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; MACHO-NEXT:    adrp x16, Ltmp0@PAGE
-; MACHO-NEXT:    add x16, x16, Ltmp0@PAGEOFF
-; MACHO-NEXT:    mov x17, #34947 ; =0x8883
-; MACHO-NEXT:    pacia x16, x17
-; MACHO-NEXT:    mov x0, x16
-; MACHO-NEXT:    adrp x16, Ltmp1@PAGE
-; MACHO-NEXT:    add x16, x16, Ltmp1@PAGEOFF
-; MACHO-NEXT:    mov x17, #34947 ; =0x8883
-; MACHO-NEXT:    pacia x16, x17
-; MACHO-NEXT:    mov x1, x16
-; MACHO-NEXT:    bl _dummy_choose
-; MACHO-NEXT:    mov x17, #34947 ; =0x8883
-; MACHO-NEXT:    braa x0, x17
-; MACHO-NEXT:  Ltmp0: ; Block address taken
-; MACHO-NEXT:  LBB0_1: ; %bb1
-; MACHO-NEXT:    mov w0, #1 ; =0x1
-; MACHO-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; MACHO-NEXT:    ret
-; MACHO-NEXT:  Ltmp1: ; Block address taken
-; MACHO-NEXT:  LBB0_2: ; %bb2
-; MACHO-NEXT:    mov w0, #2 ; =0x2
-; MACHO-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; MACHO-NEXT:    ret
-;
-; ELF-LABEL: test_indirectbr:
-; ELF:       // %bb.0: // %entry
-; ELF-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; ELF-NEXT:    adrp x16, .Ltmp0
-; ELF-NEXT:    add x16, x16, :lo12:.Ltmp0
-; ELF-NEXT:    mov x17, #34947 // =0x8883
-; ELF-NEXT:    pacia x16, x17
-; ELF-NEXT:    mov x0, x16
-; ELF-NEXT:    adrp x16, .Ltmp1
-; ELF-NEXT:    add x16, x16, :lo12:.Ltmp1
-; ELF-NEXT:    mov x17, #34947 // =0x8883
-; ELF-NEXT:    pacia x16, x17
-; ELF-NEXT:    mov x1, x16
-; ELF-NEXT:    bl dummy_choose
-; ELF-NEXT:    mov x17, #34947 // =0x8883
-; ELF-NEXT:    braa x0, x17
-; ELF-NEXT:  .Ltmp0: // Block address taken
-; ELF-NEXT:  .LBB0_1: // %bb1
-; ELF-NEXT:    mov w0, #1 // =0x1
-; ELF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; ELF-NEXT:    ret
-; ELF-NEXT:  .Ltmp1: // Block address taken
-; ELF-NEXT:  .LBB0_2: // %bb2
-; ELF-NEXT:    mov w0, #2 // =0x2
-; ELF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; ELF-NEXT:    ret
-entry:
-  %tmp0 = call ptr @dummy_choose(ptr blockaddress(@test_indirectbr, %bb1), ptr blockaddress(@test_indirectbr, %bb2))
-  indirectbr ptr %tmp0, [label %bb1, label %bb2]
-
-bb1:
-  ret i32 1
-
-bb2:
-  ret i32 2
-}
-
-define ptr @test_indirectbr_other_function() #0 {
-; MACHO-LABEL: test_indirectbr_other_function:
-; MACHO:       ; %bb.0:
-; MACHO-NEXT:    adrp x16, Ltmp0@PAGE
-; MACHO-NEXT:    add x16, x16, Ltmp0@PAGEOFF
-; MACHO-NEXT:    mov x17, #34947 ; =0x8883
-; MACHO-NEXT:    pacia x16, x17
-; MACHO-NEXT:    mov x0, x16
-; MACHO-NEXT:    ret
-;
-; ELF-LABEL: test_indirectbr_other_function:
-; ELF:       // %bb.0:
-; ELF-NEXT:    adrp x16, .Ltmp0
-; ELF-NEXT:    add x16, x16, :lo12:.Ltmp0
-; ELF-NEXT:    mov x17, #34947 // =0x8883
-; ELF-NEXT:    pacia x16, x17
-; ELF-NEXT:    mov x0, x16
-; ELF-NEXT:    ret
-  ret ptr blockaddress(@test_indirectbr, %bb1)
-}
-
-;; Test another function to compare the discriminator.
-;; ptrauth_string_discriminator("test_indirectbr_2 blockaddress") == 40224
-
-define i32 @test_indirectbr_2() #0 {
-; MACHO-LABEL: test_indirectbr_2:
-; MACHO:       ; %bb.0: ; %entry
-; MACHO-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; MACHO-NEXT:    adrp x16, Ltmp2@PAGE
-; MACHO-NEXT:    add x16, x16, Ltmp2@PAGEOFF
-; MACHO-NEXT:    mov x17, #40224 ; =0x9d20
-; MACHO-NEXT:    pacia x16, x17
-; MACHO-NEXT:    mov x0, x16
-; MACHO-NEXT:    adrp x16, Ltmp3@PAGE
-; MACHO-NEXT:    add x16, x16, Ltmp3@PAGEOFF
-; MACHO-NEXT:    mov x17, #40224 ; =0x9d20
-; MACHO-NEXT:    pacia x16, x17
-; MACHO-NEXT:    mov x1, x16
-; MACHO-NEXT:    bl _dummy_choose
-; MACHO-NEXT:    mov x17, #40224 ; =0x9d20
-; MACHO-NEXT:    braa x0, x17
-; MACHO-NEXT:  Ltmp2: ; Block address taken
-; MACHO-NEXT:  LBB2_1: ; %bb1
-; MACHO-NEXT:    mov w0, #1 ; =0x1
-; MACHO-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; MACHO-NEXT:    ret
-; MACHO-NEXT:  Ltmp3: ; Block address taken
-; MACHO-NEXT:  LBB2_2: ; %bb2
-; MACHO-NEXT:    mov w0, #2 ; =0x2
-; MACHO-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; MACHO-NEXT:    ret
-;
-; ELF-LABEL: test_indirectbr_2:
-; ELF:       // %bb.0: // %entry
-; ELF-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; ELF-NEXT:    adrp x16, .Ltmp2
-; ELF-NEXT:    add x16, x16, :lo12:.Ltmp2
-; ELF-NEXT:    mov x17, #40224 // =0x9d20
-; ELF-NEXT:    pacia x16, x17
-; ELF-NEXT:    mov x0, x16
-; ELF-NEXT:    adrp x16, .Ltmp3
-; ELF-NEXT:    add x16, x16, :lo12:.Ltmp3
-; ELF-NEXT:    mov x17, #40224 // =0x9d20
-; ELF-NEXT:    pacia x16, x17
-; ELF-NEXT:    mov x1, x16
-; ELF-NEXT:    bl dummy_choose
-; ELF-NEXT:    mov x17, #40224 // =0x9d20
-; ELF-NEXT:    braa x0, x17
-; ELF-NEXT:  .Ltmp2: // Block address taken
-; ELF-NEXT:  .LBB2_1: // %bb1
-; ELF-NEXT:    mov w0, #1 // =0x1
-; ELF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; ELF-NEXT:    ret
-; ELF-NEXT:  .Ltmp3: // Block address taken
-; ELF-NEXT:  .LBB2_2: // %bb2
-; ELF-NEXT:    mov w0, #2 // =0x2
-; ELF-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; ELF-NEXT:    ret
-entry:
-  %tmp0 = call ptr @dummy_choose(ptr blockaddress(@test_indirectbr_2, %bb1), ptr blockaddress(@test_indirectbr_2, %bb2))
-  indirectbr ptr %tmp0, [label %bb1, label %bb2]
-
-bb1:
-  ret i32 1
-
-bb2:
-  ret i32 2
-}
-
-;; Check we don't interfere with jump-table BRIND lowering.
-
-; MACHO-LABEL: test_jumptable:
-; MACHO:        adrp x9, LJTI3_0@PAGE
-; MACHO-NEXT:   add x9, x9, LJTI3_0@PAGEOFF
-; MACHO-NEXT:   adr x10, LBB3_2
-; MACHO-NEXT:   ldrb w11, [x9, x8]
-; MACHO-NEXT:   add x10, x10, x11, lsl #2
-; MACHO-NEXT:   br x10
-
-; ELF-LABEL: test_jumptable:
-; ELF:        adrp x9, .LJTI3_0
-; ELF-NEXT:   add x9, x9, :lo12:.LJTI3_0
-; ELF-NEXT:   adr x10, .LBB3_2
-; ELF-NEXT:   ldrb w11, [x9, x8]
-; ELF-NEXT:   add x10, x10, x11, lsl #2
-; ELF-NEXT:   br x10
-define i32 @test_jumptable(i32 %in) #0 {
-  switch i32 %in, label %def [
-    i32 0, label %lbl1
-    i32 1, label %lbl2
-  ]
-
-def:
-  ret i32 0
-
-lbl1:
-  ret i32 1
-
-lbl2:
-  ret i32 2
-}
-
-; MACHO-LABEL: .globl _test_indirectbr_array
-; MACHO-NEXT:  .p2align 4
-; MACHO-NEXT:  _test_indirectbr_array:
-; MACHO-NEXT:   .quad Ltmp0@AUTH(ia,34947)
-; MACHO-NEXT:   .quad Ltmp1@AUTH(ia,34947)
-; MACHO-NEXT:   .quad Ltmp2@AUTH(ia,40224)
-; MACHO-NEXT:   .quad Ltmp3@AUTH(ia,40224)
-
-; ELF-LABEL: .globl test_indirectbr_array
-; ELF-NEXT:  .p2align 4, 0x0
-; ELF-NEXT:  test_indirectbr_array:
-; ELF-NEXT:   .xword .Ltmp0@AUTH(ia,34947)
-; ELF-NEXT:   .xword .Ltmp1@AUTH(ia,34947)
-; ELF-NEXT:   .xword .Ltmp2@AUTH(ia,40224)
-; ELF-NEXT:   .xword .Ltmp3@AUTH(ia,40224)
-; ELF-NEXT:   .size test_indirectbr_array, 32
-
-@test_indirectbr_array = constant [4 x ptr] [
-  ptr blockaddress(@test_indirectbr, %bb1), ptr blockaddress(@test_indirectbr, %bb2),
-  ptr blockaddress(@test_indirectbr_2, %bb1), ptr blockaddress(@test_indirectbr_2, %bb2)
-]
-
-declare ptr @dummy_choose(ptr, ptr)
-
-attributes #0 = { "ptrauth-indirect-gotos" nounwind }
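
ptrauth-indirectbr.ll is removed as well; it documented signed computed gotos: every blockaddress in a function is signed with key IA and one per-function string discriminator (34947 and 40224 in the checks), and the indirect goto authenticates with braa. The GNU C shape that yields this IR, illustrative only (dummy_choose is the test's own helper):

/* Address-taken labels under "ptrauth-indirect-gotos": each &&label is
   signed, and goto *t lowers to braa x0, x17. GNU C labels-as-values
   extension; a sketch of the source pattern, not the test itself. */
extern void *dummy_choose(void *a, void *b);

int test_indirectbr(void) {
    void *t = dummy_choose(&&bb1, &&bb2);
    goto *t;
bb1: return 1;
bb2: return 2;
}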
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll
deleted file mode 100644
index 3b93acd8e46f7..0000000000000
--- a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll
+++ /dev/null
@@ -1,254 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
-; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s --check-prefix=UNCHECKED
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s --check-prefix=UNCHECKED
-
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
-; RUN:                                     | FileCheck %s --check-prefix=CHECKED
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN:                                     | FileCheck %s --check-prefix=CHECKED
-
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
-; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s --check-prefix=TRAP
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s --check-prefix=TRAP
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-
-define i64 @test_auth_blend(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_auth_blend:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    mov x17, x1
-; UNCHECKED-NEXT:    movk x17, #65535, lsl #48
-; UNCHECKED-NEXT:    autda x16, x17
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_blend:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    mov x17, x1
-; CHECKED-NEXT:    movk x17, #65535, lsl #48
-; CHECKED-NEXT:    autda x16, x17
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_blend:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    mov x17, x1
-; TRAP-NEXT:    movk x17, #65535, lsl #48
-; TRAP-NEXT:    autda x16, x17
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_0
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_0:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 65535)
-  %tmp1 = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %tmp0)
-  ret i64 %tmp1
-}
-
-define i64 @test_resign_blend(i64 %arg, i64 %arg1, i64 %arg2) {
-; UNCHECKED-LABEL: test_resign_blend:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    mov x17, x1
-; UNCHECKED-NEXT:    movk x17, #12345, lsl #48
-; UNCHECKED-NEXT:    autda x16, x17
-; UNCHECKED-NEXT:    mov x17, x2
-; UNCHECKED-NEXT:    movk x17, #56789, lsl #48
-; UNCHECKED-NEXT:    pacdb x16, x17
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_blend:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    mov x17, x1
-; CHECKED-NEXT:    movk x17, #12345, lsl #48
-; CHECKED-NEXT:    autda x16, x17
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpacd x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_0
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_0
-; CHECKED-NEXT:  Lauth_success_0:
-; CHECKED-NEXT:    mov x17, x2
-; CHECKED-NEXT:    movk x17, #56789, lsl #48
-; CHECKED-NEXT:    pacdb x16, x17
-; CHECKED-NEXT:  Lresign_end_0:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_blend:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    mov x17, x1
-; TRAP-NEXT:    movk x17, #12345, lsl #48
-; TRAP-NEXT:    autda x16, x17
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_1
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_1:
-; TRAP-NEXT:    mov x17, x2
-; TRAP-NEXT:    movk x17, #56789, lsl #48
-; TRAP-NEXT:    pacdb x16, x17
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 12345)
-  %tmp1 = call i64 @llvm.ptrauth.blend(i64 %arg2, i64 56789)
-  %tmp2 = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %tmp0, i32 3, i64 %tmp1)
-  ret i64 %tmp2
-}
-
-define i64 @test_resign_blend_and_const(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_resign_blend_and_const:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    mov x17, x1
-; UNCHECKED-NEXT:    movk x17, #12345, lsl #48
-; UNCHECKED-NEXT:    autda x16, x17
-; UNCHECKED-NEXT:    mov x17, #56789 ; =0xddd5
-; UNCHECKED-NEXT:    pacdb x16, x17
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_blend_and_const:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    mov x17, x1
-; CHECKED-NEXT:    movk x17, #12345, lsl #48
-; CHECKED-NEXT:    autda x16, x17
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpacd x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_1
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_1
-; CHECKED-NEXT:  Lauth_success_1:
-; CHECKED-NEXT:    mov x17, #56789 ; =0xddd5
-; CHECKED-NEXT:    pacdb x16, x17
-; CHECKED-NEXT:  Lresign_end_1:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_blend_and_const:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    mov x17, x1
-; TRAP-NEXT:    movk x17, #12345, lsl #48
-; TRAP-NEXT:    autda x16, x17
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_2
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_2:
-; TRAP-NEXT:    mov x17, #56789 ; =0xddd5
-; TRAP-NEXT:    pacdb x16, x17
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 12345)
-  %tmp1 = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %tmp0, i32 3, i64 56789)
-  ret i64 %tmp1
-}
-
-define i64 @test_resign_blend_and_addr(i64 %arg, i64 %arg1, i64 %arg2) {
-; UNCHECKED-LABEL: test_resign_blend_and_addr:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    mov x17, x1
-; UNCHECKED-NEXT:    movk x17, #12345, lsl #48
-; UNCHECKED-NEXT:    autda x16, x17
-; UNCHECKED-NEXT:    pacdb x16, x2
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_blend_and_addr:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    mov x17, x1
-; CHECKED-NEXT:    movk x17, #12345, lsl #48
-; CHECKED-NEXT:    autda x16, x17
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpacd x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_2
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_2
-; CHECKED-NEXT:  Lauth_success_2:
-; CHECKED-NEXT:    pacdb x16, x2
-; CHECKED-NEXT:  Lresign_end_2:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_blend_and_addr:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    mov x17, x1
-; TRAP-NEXT:    movk x17, #12345, lsl #48
-; TRAP-NEXT:    autda x16, x17
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_3
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_3:
-; TRAP-NEXT:    pacdb x16, x2
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 12345)
-  %tmp1 = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %tmp0, i32 3, i64 %arg2)
-  ret i64 %tmp1
-}
-
-define i64 @test_auth_too_large_discriminator(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_auth_too_large_discriminator:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov w8, #65536 ; =0x10000
-; UNCHECKED-NEXT:    bfi x1, x8, #48, #16
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autda x16, x1
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_too_large_discriminator:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov w8, #65536 ; =0x10000
-; CHECKED-NEXT:    bfi x1, x8, #48, #16
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autda x16, x1
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_too_large_discriminator:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov w8, #65536 ; =0x10000
-; TRAP-NEXT:    bfi x1, x8, #48, #16
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autda x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_4
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_4:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 65536)
-  %tmp1 = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %tmp0)
-  ret i64 %tmp1
-}
-
-declare i64 @llvm.ptrauth.auth(i64, i32, i64)
-declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64)
-declare i64 @llvm.ptrauth.blend(i64, i64)
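
The deleted resign-with-blend test pinned down llvm.ptrauth.blend lowering: a 16-bit integer discriminator is inserted into the top bits of the address discriminator with a single movk ..., lsl #48, and a discriminator that does not fit 16 bits (the 65536 case in the checks) is instead materialized and inserted with mov/bfi. A model of the blend semantics, not the intrinsic's implementation:

#include <stdint.h>

/* What `movk x17, #imm, lsl #48` computes for llvm.ptrauth.blend. */
uint64_t ptrauth_blend(uint64_t addr_disc, uint16_t int_disc) {
    return (addr_disc & 0x0000ffffffffffffULL) | ((uint64_t)int_disc << 48);
}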
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
deleted file mode 100644
index 62c9fba853adb..0000000000000
--- a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
+++ /dev/null
@@ -1,638 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
-; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s --check-prefix=UNCHECKED
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN:   -aarch64-ptrauth-auth-checks=none | FileCheck %s --check-prefix=UNCHECKED
-
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
-; RUN:                                     | FileCheck %s --check-prefix=CHECKED
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN:                                     | FileCheck %s --check-prefix=CHECKED
-
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0                    -verify-machineinstrs \
-; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s --check-prefix=TRAP
-; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN:   -aarch64-ptrauth-auth-checks=trap | FileCheck %s --check-prefix=TRAP
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-
-define i64 @test_auth_ia(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_auth_ia:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autia x16, x1
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_ia:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autia x16, x1
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_ia:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autia x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_0
-; TRAP-NEXT:    brk #0xc470
-; TRAP-NEXT:  Lauth_success_0:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_ia_zero(i64 %arg) {
-; UNCHECKED-LABEL: test_auth_ia_zero:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autiza x16
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_ia_zero:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autiza x16
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_ia_zero:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autiza x16
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_1
-; TRAP-NEXT:    brk #0xc470
-; TRAP-NEXT:  Lauth_success_1:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 0)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_ib(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_auth_ib:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autib x16, x1
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_ib:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autib x16, x1
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_ib:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autib x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_2
-; TRAP-NEXT:    brk #0xc471
-; TRAP-NEXT:  Lauth_success_2:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 1, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_ib_zero(i64 %arg) {
-; UNCHECKED-LABEL: test_auth_ib_zero:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autizb x16
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_ib_zero:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autizb x16
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_ib_zero:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autizb x16
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_3
-; TRAP-NEXT:    brk #0xc471
-; TRAP-NEXT:  Lauth_success_3:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 1, i64 0)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_da(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_auth_da:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autda x16, x1
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_da:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autda x16, x1
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_da:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autda x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_4
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_4:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_da_zero(i64 %arg) {
-; UNCHECKED-LABEL: test_auth_da_zero:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autdza x16
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_da_zero:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autdza x16
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_da_zero:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autdza x16
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_5
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_5:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 0)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_db(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_auth_db:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autdb x16, x1
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_db:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autdb x16, x1
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_db:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autdb x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_6
-; TRAP-NEXT:    brk #0xc473
-; TRAP-NEXT:  Lauth_success_6:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 3, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_db_zero(i64 %arg) {
-; UNCHECKED-LABEL: test_auth_db_zero:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autdzb x16
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_db_zero:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autdzb x16
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_db_zero:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autdzb x16
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_7
-; TRAP-NEXT:    brk #0xc473
-; TRAP-NEXT:  Lauth_success_7:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 3, i64 0)
-  ret i64 %tmp
-}
-
-;; Note that this might seem like a no-op but is actually a valid way to enforce
-;; the validity of a signature.
-define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
-; UNCHECKED-LABEL: test_resign_ia_ia:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autia x16, x1
-; UNCHECKED-NEXT:    pacia x16, x2
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_ia_ia:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autia x16, x1
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpaci x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_0
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_0
-; CHECKED-NEXT:  Lauth_success_0:
-; CHECKED-NEXT:    pacia x16, x2
-; CHECKED-NEXT:  Lresign_end_0:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_ia_ia:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autia x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_8
-; TRAP-NEXT:    brk #0xc470
-; TRAP-NEXT:  Lauth_success_8:
-; TRAP-NEXT:    pacia x16, x2
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 0, i64 %arg1, i32 0, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
-; UNCHECKED-LABEL: test_resign_ib_ia:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autib x16, x1
-; UNCHECKED-NEXT:    pacia x16, x2
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_ib_ia:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autib x16, x1
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpaci x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_1
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_1
-; CHECKED-NEXT:  Lauth_success_1:
-; CHECKED-NEXT:    pacia x16, x2
-; CHECKED-NEXT:  Lresign_end_1:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_ib_ia:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autib x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_9
-; TRAP-NEXT:    brk #0xc471
-; TRAP-NEXT:  Lauth_success_9:
-; TRAP-NEXT:    pacia x16, x2
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 1, i64 %arg1, i32 0, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
-; UNCHECKED-LABEL: test_resign_da_ia:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autda x16, x1
-; UNCHECKED-NEXT:    pacia x16, x2
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_da_ia:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autda x16, x1
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpacd x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_2
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_2
-; CHECKED-NEXT:  Lauth_success_2:
-; CHECKED-NEXT:    pacia x16, x2
-; CHECKED-NEXT:  Lresign_end_2:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_da_ia:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autda x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_10
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_10:
-; TRAP-NEXT:    pacia x16, x2
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 0, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
-; UNCHECKED-LABEL: test_resign_db_da:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autdb x16, x1
-; UNCHECKED-NEXT:    pacda x16, x2
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_db_da:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autdb x16, x1
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpacd x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_3
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_3
-; CHECKED-NEXT:  Lauth_success_3:
-; CHECKED-NEXT:    pacda x16, x2
-; CHECKED-NEXT:  Lresign_end_3:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_db_da:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autdb x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_11
-; TRAP-NEXT:    brk #0xc473
-; TRAP-NEXT:  Lauth_success_11:
-; TRAP-NEXT:    pacda x16, x2
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 3, i64 %arg1, i32 2, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
-; UNCHECKED-LABEL: test_resign_iza_db:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autiza x16
-; UNCHECKED-NEXT:    pacdb x16, x2
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_iza_db:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autiza x16
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpaci x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_4
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_4
-; CHECKED-NEXT:  Lauth_success_4:
-; CHECKED-NEXT:    pacdb x16, x2
-; CHECKED-NEXT:  Lresign_end_4:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_iza_db:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autiza x16
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_12
-; TRAP-NEXT:    brk #0xc470
-; TRAP-NEXT:  Lauth_success_12:
-; TRAP-NEXT:    pacdb x16, x2
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 0, i64 0, i32 3, i64 %arg2)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
-; UNCHECKED-LABEL: test_resign_da_dzb:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autda x16, x1
-; UNCHECKED-NEXT:    pacdzb x16
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_da_dzb:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autda x16, x1
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpacd x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_5
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_5
-; CHECKED-NEXT:  Lauth_success_5:
-; CHECKED-NEXT:    pacdzb x16
-; CHECKED-NEXT:  Lresign_end_5:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_da_dzb:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autda x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_13
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_13:
-; TRAP-NEXT:    pacdzb x16
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 3, i64 0)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_trap_attribute(i64 %arg, i64 %arg1) "ptrauth-auth-traps" {
-; UNCHECKED-LABEL: test_auth_trap_attribute:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autia x16, x1
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_trap_attribute:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autia x16, x1
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpaci x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_6
-; CHECKED-NEXT:    brk #0xc470
-; CHECKED-NEXT:  Lauth_success_6:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_trap_attribute:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autia x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_14
-; TRAP-NEXT:    brk #0xc470
-; TRAP-NEXT:  Lauth_success_14:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 %arg1)
-  ret i64 %tmp
-}
-
-define i64 @test_auth_ia_constdisc(i64 %arg) {
-; UNCHECKED-LABEL: test_auth_ia_constdisc:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    mov x17, #256 ; =0x100
-; UNCHECKED-NEXT:    autia x16, x17
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_auth_ia_constdisc:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    mov x17, #256 ; =0x100
-; CHECKED-NEXT:    autia x16, x17
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_auth_ia_constdisc:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    mov x17, #256 ; =0x100
-; TRAP-NEXT:    autia x16, x17
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpaci x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_15
-; TRAP-NEXT:    brk #0xc470
-; TRAP-NEXT:  Lauth_success_15:
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.auth(i64 %arg, i32 0, i64 256)
-  ret i64 %tmp
-}
-
-define i64 @test_resign_da_constdisc(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_resign_da_constdisc:
-; UNCHECKED:       ; %bb.0:
-; UNCHECKED-NEXT:    mov x16, x0
-; UNCHECKED-NEXT:    autda x16, x1
-; UNCHECKED-NEXT:    mov x17, #256 ; =0x100
-; UNCHECKED-NEXT:    pacda x16, x17
-; UNCHECKED-NEXT:    mov x0, x16
-; UNCHECKED-NEXT:    ret
-;
-; CHECKED-LABEL: test_resign_da_constdisc:
-; CHECKED:       ; %bb.0:
-; CHECKED-NEXT:    mov x16, x0
-; CHECKED-NEXT:    autda x16, x1
-; CHECKED-NEXT:    mov x17, x16
-; CHECKED-NEXT:    xpacd x17
-; CHECKED-NEXT:    cmp x16, x17
-; CHECKED-NEXT:    b.eq Lauth_success_7
-; CHECKED-NEXT:    mov x16, x17
-; CHECKED-NEXT:    b Lresign_end_6
-; CHECKED-NEXT:  Lauth_success_7:
-; CHECKED-NEXT:    mov x17, #256 ; =0x100
-; CHECKED-NEXT:    pacda x16, x17
-; CHECKED-NEXT:  Lresign_end_6:
-; CHECKED-NEXT:    mov x0, x16
-; CHECKED-NEXT:    ret
-;
-; TRAP-LABEL: test_resign_da_constdisc:
-; TRAP:       ; %bb.0:
-; TRAP-NEXT:    mov x16, x0
-; TRAP-NEXT:    autda x16, x1
-; TRAP-NEXT:    mov x17, x16
-; TRAP-NEXT:    xpacd x17
-; TRAP-NEXT:    cmp x16, x17
-; TRAP-NEXT:    b.eq Lauth_success_16
-; TRAP-NEXT:    brk #0xc472
-; TRAP-NEXT:  Lauth_success_16:
-; TRAP-NEXT:    mov x17, #256 ; =0x100
-; TRAP-NEXT:    pacda x16, x17
-; TRAP-NEXT:    mov x0, x16
-; TRAP-NEXT:    ret
-  %tmp = call i64 @llvm.ptrauth.resign(i64 %arg, i32 2, i64 %arg1, i32 2, i64 256)
-  ret i64 %tmp
-}
-
-declare i64 @llvm.ptrauth.auth(i64, i32, i64)
-declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64)
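
For context on the checks removed above: @llvm.ptrauth.resign(%ptr, %oldkey, %olddisc, %newkey, %newdisc) authenticates a signed pointer under the old key/discriminator and re-signs it under the new ones; the i32 key operands 0-3 select IA/IB/DA/DB, which is why key 2 lowers to autda and key 3 with a zero discriminator to pacdzb in the sequences above. A minimal standalone sketch mirroring the deleted test; the function name @resign_da_to_dzb is illustrative only:

declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64)

define i64 @resign_da_to_dzb(i64 %signed_ptr, i64 %old_disc) {
  ; Authenticate with key DA (2) and %old_disc, then re-sign with key DB (3)
  ; and a zero discriminator.
  %p = call i64 @llvm.ptrauth.resign(i64 %signed_ptr, i32 2, i64 %old_disc, i32 3, i64 0)
  ret i64 %p
}
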
diff --git a/llvm/test/CodeGen/AArch64/scmp.ll b/llvm/test/CodeGen/AArch64/scmp.ll
index bcad1c1a11aa7..a7abc5eadaff6 100644
--- a/llvm/test/CodeGen/AArch64/scmp.ll
+++ b/llvm/test/CodeGen/AArch64/scmp.ll
@@ -1,45 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 
 define i8 @scmp.8.8(i8 %x, i8 %y) nounwind {
-; CHECK-SD-LABEL: scmp.8.8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    sxtb w8, w0
-; CHECK-SD-NEXT:    cmp w8, w1, sxtb
-; CHECK-SD-NEXT:    cset w8, gt
-; CHECK-SD-NEXT:    csinv w0, w8, wzr, ge
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scmp.8.8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sxtb w8, w0
-; CHECK-GI-NEXT:    sxtb w9, w1
-; CHECK-GI-NEXT:    cmp w8, w9
-; CHECK-GI-NEXT:    cset w8, gt
-; CHECK-GI-NEXT:    csinv w0, w8, wzr, ge
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scmp.8.8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    cmp w8, w1, sxtb
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    csinv w0, w8, wzr, ge
+; CHECK-NEXT:    ret
   %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
   ret i8 %1
 }
 
 define i8 @scmp.8.16(i16 %x, i16 %y) nounwind {
-; CHECK-SD-LABEL: scmp.8.16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    sxth w8, w0
-; CHECK-SD-NEXT:    cmp w8, w1, sxth
-; CHECK-SD-NEXT:    cset w8, gt
-; CHECK-SD-NEXT:    csinv w0, w8, wzr, ge
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scmp.8.16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sxth w8, w0
-; CHECK-GI-NEXT:    sxth w9, w1
-; CHECK-GI-NEXT:    cmp w8, w9
-; CHECK-GI-NEXT:    cset w8, gt
-; CHECK-GI-NEXT:    csinv w0, w8, wzr, ge
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scmp.8.16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sxth w8, w0
+; CHECK-NEXT:    cmp w8, w1, sxth
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    csinv w0, w8, wzr, ge
+; CHECK-NEXT:    ret
   %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
   ret i8 %1
 }
@@ -67,35 +48,15 @@ define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
 }
 
 define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
-; CHECK-SD-LABEL: scmp.8.128:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    cmp x2, x0
-; CHECK-SD-NEXT:    sbcs xzr, x3, x1
-; CHECK-SD-NEXT:    cset w8, lt
-; CHECK-SD-NEXT:    cmp x0, x2
-; CHECK-SD-NEXT:    sbcs xzr, x1, x3
-; CHECK-SD-NEXT:    csinv w0, w8, wzr, ge
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scmp.8.128:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    cmp x1, x3
-; CHECK-GI-NEXT:    cset w8, gt
-; CHECK-GI-NEXT:    cmp x0, x2
-; CHECK-GI-NEXT:    cset w9, hi
-; CHECK-GI-NEXT:    cmp x1, x3
-; CHECK-GI-NEXT:    csel w8, w9, w8, eq
-; CHECK-GI-NEXT:    tst w8, #0x1
-; CHECK-GI-NEXT:    cset w8, ne
-; CHECK-GI-NEXT:    cmp x1, x3
-; CHECK-GI-NEXT:    cset w9, lt
-; CHECK-GI-NEXT:    cmp x0, x2
-; CHECK-GI-NEXT:    cset w10, lo
-; CHECK-GI-NEXT:    cmp x1, x3
-; CHECK-GI-NEXT:    csel w9, w10, w9, eq
-; CHECK-GI-NEXT:    tst w9, #0x1
-; CHECK-GI-NEXT:    csinv w0, w8, wzr, eq
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scmp.8.128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x2, x0
+; CHECK-NEXT:    sbcs xzr, x3, x1
+; CHECK-NEXT:    cset w8, lt
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    sbcs xzr, x1, x3
+; CHECK-NEXT:    csinv w0, w8, wzr, ge
+; CHECK-NEXT:    ret
   %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
   ret i8 %1
 }
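
For reference, llvm.scmp and llvm.ucmp (exercised by scmp.ll above and ucmp.ll further down) are three-way comparisons: they return -1, 0, or +1 for less-than, equal, and greater-than under signed and unsigned ordering respectively, which is what the cset/csinv pairs in the expected output compute. A minimal sketch, runnable with llc; @cmp3_example is an illustrative name:

declare i8 @llvm.scmp.i8.i32(i32, i32)

define i8 @cmp3_example(i32 %x, i32 %y) {
  ; -1 if %x < %y (signed), 0 if equal, +1 if %x > %y.
  %r = call i8 @llvm.scmp.i8.i32(i32 %x, i32 %y)
  ret i8 %r
}
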
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index 7bc31d44bb654..b289dfbec527c 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -103,17 +103,12 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
 define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: add_ext_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z2.h, z0.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z3.h, z1.b
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEXT:    add z1.h, z1.h, z3.h
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    uaddv d1, p0, z1.b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ret
   %ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
   %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -126,21 +121,15 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: add_ext_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    uunpklo z4.h, z0.b
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpkhi z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z5.h, z2.b
-; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    add z1.h, z4.h, z3.h
-; CHECK-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NEXT:    add z1.h, z2.h, z5.h
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uaddv d1, p0, z1.b
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    uaddv d2, p0, z2.b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ret
   %ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
   %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
index 1ab2589bccd5b..24c817d410301 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
@@ -6,24 +7,31 @@
 
 define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) {
 ; SVE256-LABEL: test:
-; SVE256:       ld1b    { z0.h }, p0/z,
-; SVE256:       ld1b    { z1.h }, p0/z,
-; SVE256:       sub z0.h, z0.h, z1.h
-; SVE256-NEXT:  sunpklo z1.s, z0.h
-; SVE256-NEXT:  ext z0.b, z0.b, z0.b, #16
-; SVE256-NEXT:  sunpklo z0.s, z0.h
-; SVE256-NEXT:  add z0.s, z1.s, z0.s
-; SVE256-NEXT:  uaddv   d0, p1, z0.s
+; SVE256:       // %bb.0: // %L.entry
+; SVE256-NEXT:    ptrue p0.h, vl16
+; SVE256-NEXT:    mov w9, wzr
+; SVE256-NEXT:    mov w10, wzr
+; SVE256-NEXT:    mov w8, wzr
+; SVE256-NEXT:    mov w11, #-16 // =0xfffffff0
+; SVE256-NEXT:    .p2align 5, , 16
+; SVE256-NEXT:  .LBB0_1: // %L1
+; SVE256-NEXT:    // =>This Inner Loop Header: Depth=1
+; SVE256-NEXT:    sxtw x12, w9
+; SVE256-NEXT:    sxtw x13, w10
+; SVE256-NEXT:    adds w11, w11, #1
+; SVE256-NEXT:    add w10, w10, w3
+; SVE256-NEXT:    ld1b { z0.h }, p0/z, [x0, x12]
+; SVE256-NEXT:    ld1b { z1.h }, p0/z, [x2, x13]
+; SVE256-NEXT:    add w9, w9, w1
+; SVE256-NEXT:    sub z0.h, z0.h, z1.h
+; SVE256-NEXT:    saddv d0, p0, z0.h
+; SVE256-NEXT:    fmov w12, s0
+; SVE256-NEXT:    add w8, w12, w8
+; SVE256-NEXT:    b.lo .LBB0_1
+; SVE256-NEXT:  // %bb.2: // %L2
+; SVE256-NEXT:    mov w0, w8
+; SVE256-NEXT:    ret
 
-; NEON-LABEL: test:
-; NEON:       ldr q0, [x0, w9, sxtw]
-; NEON:       ldr q1, [x2, w10, sxtw]
-; NEON:       usubl2  v2.8h, v0.16b, v1.16b
-; NEON-NEXT:  usubl   v0.8h, v0.8b, v1.8b
-; NEON:       saddl2  v1.4s, v0.8h, v2.8h
-; NEON-NEXT:  saddl   v0.4s, v0.4h, v2.4h
-; NEON-NEXT:  add v0.4s, v0.4s, v1.4s
-; NEON-NEXT:  addv    s0, v0.4s
 
 L.entry:
   br label %L1
@@ -55,3 +63,5 @@ L2:                                          ; preds = %L1
 }
 
 declare  i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NEON: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
index 0d7f230062650..afe13851f0b95 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
@@ -24,8 +24,7 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, <vscale x 2 x ptr> %i37, <
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z4.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z3.s
 ; CHECK-NEXT:    add z0.d, z2.d, z1.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    and p2.b, p1/z, p1.b, p2.b
+; CHECK-NEXT:    bic p2.b, p1/z, p1.b, p2.b
 ; CHECK-NEXT:    mov z0.d, p2/m, z2.d
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z2.d
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index 8c1b5225b7f25..c8dd719aa03c6 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -188,6 +188,94 @@ define i64 @uaddv_nxv2i64(<vscale x 2 x i64> %a) {
   ret i64 %res
 }
 
+define i32 @uaddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: uaddv_nxv16i8_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %1 = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
+  ret i32 %2
+}
+
+define i64 @uaddv_nxv16i16_nxv16i64(<vscale x 16 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16_nxv16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uaddv d1, p0, z1.h
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    add x0, x9, x8
+; CHECK-NEXT:    ret
+  %1 = zext <vscale x 16 x i16> %a to <vscale x 16 x i64>
+  %2 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> %1)
+  ret i64 %2
+}
+
+define i32 @uaddv_nxv16i16_nxv16i32(<vscale x 32 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uaddv d3, p0, z3.h
+; CHECK-NEXT:    uaddv d2, p0, z2.h
+; CHECK-NEXT:    uaddv d1, p0, z1.h
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    add w9, w11, w10
+; CHECK-NEXT:    add w0, w9, w8
+; CHECK-NEXT:    ret
+  %1 = zext <vscale x 32 x i16> %a to <vscale x 32 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32> %1)
+  ret i32 %2
+}
+
+define i32 @saddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: saddv_nxv16i8_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    saddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
+  ret i32 %2
+}
+
+define i32 @uaddv_nxv32i16_nxv32i32(ptr %a) {
+; CHECK-LABEL: uaddv_nxv32i16_nxv32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    uaddv d1, p0, z1.h
+; CHECK-NEXT:    uaddv d2, p0, z2.h
+; CHECK-NEXT:    uaddv d3, p0, z3.h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w11, s3
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    add w9, w11, w10
+; CHECK-NEXT:    add w0, w9, w8
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 32 x i16>, ptr %a, align 16
+  %2 = zext <vscale x 32 x i16> %1 to <vscale x 32 x i32>
+  %3 = call i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32> %2)
+  ret i32 %3
+}
+
 ; UMINV
 
 define i8 @umin_nxv16i8(<vscale x 16 x i8> %a) {
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
deleted file mode 100644
index 34d85d1f76086..0000000000000
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ /dev/null
@@ -1,135 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT
-
-; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20], Type: Variable, Align: 4, Size: 4
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Variable, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Variable, Align: 16, Size: vscale x 16
-
-define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: csr_d8_allocnxv4i32i32f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str d8, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    str x29, [sp, #8] // 8-byte Folded Spill
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -8
-; CHECK-NEXT:    .cfi_offset b8, -16
-; CHECK-NEXT:    mov z1.s, #0 // =0x0
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    add x8, sp, #16
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    //APP
-; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    str wzr, [sp, #12]
-; CHECK-NEXT:    str d0, [sp]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldr x29, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr d8, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-entry:
-  %a = alloca <vscale x 4 x i32>
-  %b = alloca i32
-  %c = alloca double
-  tail call void asm sideeffect "", "~{d8}"() #1
-  store <vscale x 4 x i32> zeroinitializer, ptr %a
-  store i32 zeroinitializer, ptr %b
-  store double %d, ptr %c
-  ret i32 0
-}
-
-; CHECK-FRAMELAYOUT-LABEL: Function: csr_d8_allocnxv4i32i32f64_fp
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20], Type: Variable, Align: 4, Size: 4
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Variable, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Variable, Align: 16, Size: vscale x 16
-
-define i32 @csr_d8_allocnxv4i32i32f64_fp(double %d) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
-; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_fp:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str d8, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    add x29, sp, #16
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    .cfi_offset b8, -32
-; CHECK-NEXT:    mov z1.s, #0 // =0x0
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    addvl x8, sp, #1
-; CHECK-NEXT:    //APP
-; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    str wzr, [x8, #28]
-; CHECK-NEXT:    sub x8, x29, #16
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    str d0, [sp, #8]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #-1, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr d8, [sp], #32 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-entry:
-  %a = alloca <vscale x 4 x i32>
-  %b = alloca i32
-  %c = alloca double
-  tail call void asm sideeffect "", "~{d8}"() #1
-  store <vscale x 4 x i32> zeroinitializer, ptr %a
-  store i32 zeroinitializer, ptr %b
-  store double %d, ptr %c
-  ret i32 0
-}
-
-; CHECK-FRAMELAYOUT-LABEL: Function: svecc_z8_allocnxv4i32i32f64_fp
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-20], Type: Variable, Align: 4, Size: 4
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Variable, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Variable, Align: 16, Size: vscale x 16
-
-define i32 @svecc_z8_allocnxv4i32i32f64_fp(double %d, <vscale x 4 x i32> %v) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
-; CHECK-LABEL: svecc_z8_allocnxv4i32i32f64_fp:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    //APP
-; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    str wzr, [sp, #12]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x29, #-2, mul vl]
-; CHECK-NEXT:    str d0, [sp], #16
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
-entry:
-  %a = alloca <vscale x 4 x i32>
-  %b = alloca i32
-  %c = alloca double
-  tail call void asm sideeffect "", "~{d8}"() #1
-  store <vscale x 4 x i32> %v, ptr %a
-  store i32 zeroinitializer, ptr %b
-  store double %d, ptr %c
-  ret i32 0
-}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
new file mode 100644
index 0000000000000..608b3bdeac75a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,NO_STREAMING
+; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE_128
+; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE_MIN_256
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @reduce_uadd_v16i8(<32 x i8> %a) #0 {
+; NO_STREAMING-LABEL: reduce_uadd_v16i8:
+; NO_STREAMING:       // %bb.0:
+; NO_STREAMING-NEXT:    ushll2 v2.8h, v1.16b, #0
+; NO_STREAMING-NEXT:    ushll v1.8h, v1.8b, #0
+; NO_STREAMING-NEXT:    ushll2 v3.8h, v0.16b, #0
+; NO_STREAMING-NEXT:    ushll v0.8h, v0.8b, #0
+; NO_STREAMING-NEXT:    uaddl2 v4.4s, v1.8h, v2.8h
+; NO_STREAMING-NEXT:    uaddl v1.4s, v1.4h, v2.4h
+; NO_STREAMING-NEXT:    uaddl2 v2.4s, v0.8h, v3.8h
+; NO_STREAMING-NEXT:    uaddl v0.4s, v0.4h, v3.4h
+; NO_STREAMING-NEXT:    add v1.4s, v1.4s, v4.4s
+; NO_STREAMING-NEXT:    add v0.4s, v0.4s, v2.4s
+; NO_STREAMING-NEXT:    add v0.4s, v0.4s, v1.4s
+; NO_STREAMING-NEXT:    addv s0, v0.4s
+; NO_STREAMING-NEXT:    fmov w0, s0
+; NO_STREAMING-NEXT:    ret
+;
+; SVE_128-LABEL: reduce_uadd_v16i8:
+; SVE_128:       // %bb.0:
+; SVE_128-NEXT:    ptrue p0.b
+; SVE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE_128-NEXT:    uaddv d1, p0, z1.b
+; SVE_128-NEXT:    uaddv d0, p0, z0.b
+; SVE_128-NEXT:    fmov x8, d1
+; SVE_128-NEXT:    fmov x9, d0
+; SVE_128-NEXT:    add w0, w9, w8
+; SVE_128-NEXT:    ret
+;
+; SVE_MIN_256-LABEL: reduce_uadd_v16i8:
+; SVE_MIN_256:       // %bb.0:
+; SVE_MIN_256-NEXT:    ptrue p0.b, vl16
+; SVE_MIN_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE_MIN_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE_MIN_256-NEXT:    splice z0.b, p0, z0.b, z1.b
+; SVE_MIN_256-NEXT:    ptrue p0.b, vl32
+; SVE_MIN_256-NEXT:    uaddv d0, p0, z0.b
+; SVE_MIN_256-NEXT:    fmov x0, d0
+; SVE_MIN_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; SVE_MIN_256-NEXT:    ret
+  %1 = zext <32 x i8> %a to <32 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
+  ret i32 %2
+}
+
+define i32 @reduce_sadd_v16i8(<32 x i8> %a) #0 {
+; NO_STREAMING-LABEL: reduce_sadd_v16i8:
+; NO_STREAMING:       // %bb.0:
+; NO_STREAMING-NEXT:    sshll2 v2.8h, v1.16b, #0
+; NO_STREAMING-NEXT:    sshll v1.8h, v1.8b, #0
+; NO_STREAMING-NEXT:    sshll2 v3.8h, v0.16b, #0
+; NO_STREAMING-NEXT:    sshll v0.8h, v0.8b, #0
+; NO_STREAMING-NEXT:    saddl2 v4.4s, v1.8h, v2.8h
+; NO_STREAMING-NEXT:    saddl v1.4s, v1.4h, v2.4h
+; NO_STREAMING-NEXT:    saddl2 v2.4s, v0.8h, v3.8h
+; NO_STREAMING-NEXT:    saddl v0.4s, v0.4h, v3.4h
+; NO_STREAMING-NEXT:    add v1.4s, v1.4s, v4.4s
+; NO_STREAMING-NEXT:    add v0.4s, v0.4s, v2.4s
+; NO_STREAMING-NEXT:    add v0.4s, v0.4s, v1.4s
+; NO_STREAMING-NEXT:    addv s0, v0.4s
+; NO_STREAMING-NEXT:    fmov w0, s0
+; NO_STREAMING-NEXT:    ret
+;
+; SVE_128-LABEL: reduce_sadd_v16i8:
+; SVE_128:       // %bb.0:
+; SVE_128-NEXT:    ptrue p0.b
+; SVE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE_128-NEXT:    saddv d1, p0, z1.b
+; SVE_128-NEXT:    saddv d0, p0, z0.b
+; SVE_128-NEXT:    fmov x8, d1
+; SVE_128-NEXT:    fmov x9, d0
+; SVE_128-NEXT:    add w0, w9, w8
+; SVE_128-NEXT:    ret
+;
+; SVE_MIN_256-LABEL: reduce_sadd_v16i8:
+; SVE_MIN_256:       // %bb.0:
+; SVE_MIN_256-NEXT:    ptrue p0.b, vl16
+; SVE_MIN_256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE_MIN_256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE_MIN_256-NEXT:    splice z0.b, p0, z0.b, z1.b
+; SVE_MIN_256-NEXT:    ptrue p0.b, vl32
+; SVE_MIN_256-NEXT:    saddv d0, p0, z0.b
+; SVE_MIN_256-NEXT:    fmov x0, d0
+; SVE_MIN_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; SVE_MIN_256-NEXT:    ret
+  %1 = sext <32 x i8> %a to <32 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
+  ret i32 %2
+}
+
+attributes #0 = { "target-features"="+sve" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
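
The reductions in the new file above all funnel through llvm.vector.reduce.add, which horizontally sums every lane of a vector into one scalar; the widening variants zero- or sign-extend the lanes first so the sum cannot wrap in the narrow element type, and on SVE that extension can fold into uaddv/saddv as the checks show. A minimal fixed-width sketch; @hsum_example is an illustrative name:

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @hsum_example(<4 x i16> %v) {
  ; Widen each i16 lane to i32, then add all four lanes into one scalar.
  %w = zext <4 x i16> %v to <4 x i32>
  %s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %w)
  ret i32 %s
}
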
diff --git a/llvm/test/CodeGen/AArch64/ucmp.ll b/llvm/test/CodeGen/AArch64/ucmp.ll
index 1a7f0be11cee6..351d440243b70 100644
--- a/llvm/test/CodeGen/AArch64/ucmp.ll
+++ b/llvm/test/CodeGen/AArch64/ucmp.ll
@@ -1,45 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 
 define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
-; CHECK-SD-LABEL: ucmp.8.8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    and w8, w0, #0xff
-; CHECK-SD-NEXT:    cmp w8, w1, uxtb
-; CHECK-SD-NEXT:    cset w8, hi
-; CHECK-SD-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ucmp.8.8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    and w8, w0, #0xff
-; CHECK-GI-NEXT:    and w9, w1, #0xff
-; CHECK-GI-NEXT:    cmp w8, w9
-; CHECK-GI-NEXT:    cset w8, hi
-; CHECK-GI-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: ucmp.8.8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    cmp w8, w1, uxtb
+; CHECK-NEXT:    cset w8, hi
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-NEXT:    ret
   %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
   ret i8 %1
 }
 
 define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
-; CHECK-SD-LABEL: ucmp.8.16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    and w8, w0, #0xffff
-; CHECK-SD-NEXT:    cmp w8, w1, uxth
-; CHECK-SD-NEXT:    cset w8, hi
-; CHECK-SD-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ucmp.8.16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    and w8, w0, #0xffff
-; CHECK-GI-NEXT:    and w9, w1, #0xffff
-; CHECK-GI-NEXT:    cmp w8, w9
-; CHECK-GI-NEXT:    cset w8, hi
-; CHECK-GI-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: ucmp.8.16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    cmp w8, w1, uxth
+; CHECK-NEXT:    cset w8, hi
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-NEXT:    ret
   %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
   ret i8 %1
 }
@@ -67,35 +48,15 @@ define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind {
 }
 
 define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
-; CHECK-SD-LABEL: ucmp.8.128:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    cmp x2, x0
-; CHECK-SD-NEXT:    sbcs xzr, x3, x1
-; CHECK-SD-NEXT:    cset w8, lo
-; CHECK-SD-NEXT:    cmp x0, x2
-; CHECK-SD-NEXT:    sbcs xzr, x1, x3
-; CHECK-SD-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ucmp.8.128:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    cmp x1, x3
-; CHECK-GI-NEXT:    cset w8, hi
-; CHECK-GI-NEXT:    cmp x0, x2
-; CHECK-GI-NEXT:    cset w9, hi
-; CHECK-GI-NEXT:    cmp x1, x3
-; CHECK-GI-NEXT:    csel w8, w9, w8, eq
-; CHECK-GI-NEXT:    tst w8, #0x1
-; CHECK-GI-NEXT:    cset w8, ne
-; CHECK-GI-NEXT:    cmp x1, x3
-; CHECK-GI-NEXT:    cset w9, lo
-; CHECK-GI-NEXT:    cmp x0, x2
-; CHECK-GI-NEXT:    cset w10, lo
-; CHECK-GI-NEXT:    cmp x1, x3
-; CHECK-GI-NEXT:    csel w9, w10, w9, eq
-; CHECK-GI-NEXT:    tst w9, #0x1
-; CHECK-GI-NEXT:    csinv w0, w8, wzr, eq
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: ucmp.8.128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x2, x0
+; CHECK-NEXT:    sbcs xzr, x3, x1
+; CHECK-NEXT:    cset w8, lo
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    sbcs xzr, x1, x3
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-NEXT:    ret
   %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
   ret i8 %1
 }
@@ -134,41 +95,18 @@ define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
 }
 
 define <1 x i64> @ucmp.1.64.65(<1 x i65> %x, <1 x i65> %y) {
-; CHECK-SD-LABEL: ucmp.1.64.65:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    and x8, x1, #0x1
-; CHECK-SD-NEXT:    and x9, x3, #0x1
-; CHECK-SD-NEXT:    cmp x2, x0
-; CHECK-SD-NEXT:    sbcs xzr, x9, x8
-; CHECK-SD-NEXT:    cset x10, lo
-; CHECK-SD-NEXT:    cmp x0, x2
-; CHECK-SD-NEXT:    sbcs xzr, x8, x9
-; CHECK-SD-NEXT:    csinv x8, x10, xzr, hs
-; CHECK-SD-NEXT:    fmov d0, x8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: ucmp.1.64.65:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    and x8, x1, #0x1
-; CHECK-GI-NEXT:    and x9, x3, #0x1
-; CHECK-GI-NEXT:    cmp x8, x9
-; CHECK-GI-NEXT:    cset w10, hi
-; CHECK-GI-NEXT:    cmp x0, x2
-; CHECK-GI-NEXT:    cset w11, hi
-; CHECK-GI-NEXT:    cmp x8, x9
-; CHECK-GI-NEXT:    csel w10, w11, w10, eq
-; CHECK-GI-NEXT:    tst w10, #0x1
-; CHECK-GI-NEXT:    cset x10, ne
-; CHECK-GI-NEXT:    cmp x8, x9
-; CHECK-GI-NEXT:    cset w11, lo
-; CHECK-GI-NEXT:    cmp x0, x2
-; CHECK-GI-NEXT:    cset w12, lo
-; CHECK-GI-NEXT:    cmp x8, x9
-; CHECK-GI-NEXT:    csel w8, w12, w11, eq
-; CHECK-GI-NEXT:    tst w8, #0x1
-; CHECK-GI-NEXT:    csinv x8, x10, xzr, eq
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: ucmp.1.64.65:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x1, #0x1
+; CHECK-NEXT:    and x9, x3, #0x1
+; CHECK-NEXT:    cmp x2, x0
+; CHECK-NEXT:    sbcs xzr, x9, x8
+; CHECK-NEXT:    cset x10, lo
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    sbcs xzr, x8, x9
+; CHECK-NEXT:    csinv x8, x10, xzr, hs
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
   %1 = call <1 x i64> @llvm.ucmp(<1 x i65> %x, <1 x i65> %y)
   ret <1 x i64> %1
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index c81fd26a77525..27e786eb1ced1 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1968,10 +1968,9 @@ define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SD-BASE:       // %bb.0: // %entry
 ; CHECK-SD-BASE-NEXT:    umull2 v2.8h, v1.16b, v0.16b
 ; CHECK-SD-BASE-NEXT:    umull v0.8h, v1.8b, v0.8b
-; CHECK-SD-BASE-NEXT:    uaddl2 v1.4s, v0.8h, v2.8h
-; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v2.4h
-; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v2.8h
+; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
@@ -2296,10 +2295,9 @@ define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SD-BASE:       // %bb.0: // %entry
 ; CHECK-SD-BASE-NEXT:    smull2 v2.8h, v1.16b, v0.16b
 ; CHECK-SD-BASE-NEXT:    smull v0.8h, v1.8b, v0.8b
-; CHECK-SD-BASE-NEXT:    saddl2 v1.4s, v0.8h, v2.8h
-; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v2.4h
-; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    saddlp v1.4s, v2.8h
+; CHECK-SD-BASE-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
@@ -3868,10 +3866,9 @@ entry:
 define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) {
 ; CHECK-SD-LABEL: add_v32i8_v32i16_zext:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
-; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT:    add v0.8h, v0.8h, v2.8h
-; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-SD-NEXT:    uadalp v1.8h, v0.16b
+; CHECK-SD-NEXT:    addv h0, v1.8h
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -3994,10 +3991,9 @@ entry:
 define i16 @add_v32i8_v32i16_sext(<32 x i8> %x) {
 ; CHECK-SD-LABEL: add_v32i8_v32i16_sext:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddl2 v2.8h, v0.16b, v1.16b
-; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-NEXT:    add v0.8h, v0.8h, v2.8h
-; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    saddlp v1.8h, v1.16b
+; CHECK-SD-NEXT:    sadalp v1.8h, v0.16b
+; CHECK-SD-NEXT:    addv h0, v1.8h
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -4238,14 +4234,14 @@ define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) {
 ; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext:
 ; CHECK-SD-BASE:       // %bb.0: // %entry
 ; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v0.16b, #0
 ; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v0.16b, #0
 ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v3.8h, v2.8h
-; CHECK-SD-BASE-NEXT:    uaddl v2.4s, v3.4h, v2.4h
-; CHECK-SD-BASE-NEXT:    uaddl2 v5.4s, v0.8h, v1.8h
-; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s
+; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v1.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    uaddl v1.4s, v1.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v3.8h
+; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v3.4h
+; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v4.4s
 ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
@@ -4511,14 +4507,14 @@ define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) {
 ; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext:
 ; CHECK-SD-BASE:       // %bb.0: // %entry
 ; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v0.16b, #0
 ; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v0.16b, #0
 ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v3.8h, v2.8h
-; CHECK-SD-BASE-NEXT:    saddl v2.4s, v3.4h, v2.4h
-; CHECK-SD-BASE-NEXT:    saddl2 v5.4s, v0.8h, v1.8h
-; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s
+; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v1.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    saddl v1.4s, v1.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v0.8h, v3.8h
+; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v3.4h
+; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v4.4s
 ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 7525e00e6f401..453b229bf62bd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1021,20 +1021,20 @@ main_body:
 define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1044,20 +1044,20 @@ main_body:
 define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fmin_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1067,20 +1067,20 @@ main_body:
 define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fmax_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1134,14 +1134,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB39_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:  .LBB39_2:
@@ -1162,13 +1162,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_cbranch_execz .LBB40_2
 ; GFX90A-NEXT:  ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX90A-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:  .LBB40_2:
@@ -1184,14 +1184,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB40_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB40_2:
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB41_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:  .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX90A-NEXT:  ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX90A-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:  .LBB42_2:
@@ -1298,14 +1298,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB42_2:
@@ -1522,14 +1522,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB49_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB49_2:
@@ -1761,19 +1761,19 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
@@ -1842,19 +1842,19 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
@@ -1884,19 +1884,19 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
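
The AMDGPU hunks above only renumber scalar registers in the expected output (s[4:7] to s[0:3]); the IR under test is unchanged and uses the target atomic intrinsics such as llvm.amdgcn.global.atomic.fadd, which atomically adds to global memory and returns the previous value. A minimal sketch mirroring the kernels above; @fadd_example is an illustrative name:

declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)

define amdgpu_kernel void @fadd_example(ptr addrspace(1) %ptr, double %data) {
  ; Atomically add %data to *%ptr; the previous value is returned but unused here.
  %old = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
  ret void
}
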
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index 6a1e52cd29fd9..5d4816812e6c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
 
 ---
 name:            fract_f64_neg
@@ -12,41 +12,23 @@ body: |
   bb.1:
     liveins: $sgpr0_sgpr1
 
-    ; GFX10-LABEL: name: fract_f64_neg
-    ; GFX10: liveins: $sgpr0_sgpr1
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
-    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
-    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
-    ; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
-    ; GFX10-NEXT: S_ENDPGM 0
-    ;
-    ; GFX11-LABEL: name: fract_f64_neg
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
-    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
-    ; GFX11-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: fract_f64_neg
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
     %2:sgpr(p4) = COPY $sgpr0_sgpr1
     %7:sgpr(s64) = G_CONSTANT i64 36
     %8:sgpr(p4) = G_PTR_ADD %2, %7(s64)
@@ -78,41 +60,23 @@ body: |
   bb.1:
     liveins: $sgpr0_sgpr1
 
-    ; GFX10-LABEL: name: fract_f64_neg_abs
-    ; GFX10: liveins: $sgpr0_sgpr1
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
-    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
-    ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ; GFX10-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
-    ; GFX10-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX10-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX10-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
-    ; GFX10-NEXT: S_ENDPGM 0
-    ;
-    ; GFX11-LABEL: name: fract_f64_neg_abs
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
-    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ; GFX11-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX11-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
-    ; GFX11-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: fract_f64_neg_abs
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_FLOOR_F64_e64 0, [[V_ADD_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_FLOOR_F64_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], [[V_ADD_F64_e64_1]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
     %2:sgpr(p4) = COPY $sgpr0_sgpr1
     %7:sgpr(s64) = G_CONSTANT i64 36
     %8:sgpr(p4) = G_PTR_ADD %2, %7(s64)
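
The hunks above collapse the previously separate GFX10 and GFX11 check blocks into a single shared CHECK block. The diff itself shows why the split is no longer needed: the deleted GFX10 lines matched the early-clobber S_LOAD_DWORDX4_IMM_ec form, while the replacement CHECK lines match the plain S_LOAD_DWORDX4_IMM form that the GFX11 block already expected, so both targets now select identical code here. A minimal sketch of the shared-prefix pattern; the RUN lines and flags below are illustrative, not copied from this file:

  # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -o - %s | FileCheck %s
  # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -o - %s | FileCheck %s

Both RUN lines feed FileCheck's default CHECK prefix, so one block of "; CHECK:" lines covers both targets and the duplicated per-target blocks can be dropped.
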
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index 02c6220ac2aca..504f7697a0fcc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -3,7 +3,7 @@
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s
 # RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 
 ---
 
@@ -44,13 +44,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s32_from_4
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (s32), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -96,13 +89,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_v2s16_from_4
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -147,13 +133,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_v2s32
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -195,15 +174,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_v2s32_align4
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -245,15 +217,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_v4s16_align4
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -296,15 +261,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_v4s32_align4
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x  s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -349,13 +307,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s64
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -398,15 +349,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_s64_align4
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 4, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -449,15 +393,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_v2s64
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -502,13 +439,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load (<2 x p1>), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<2 x p1>)
-    ;
-    ; GFX11-LABEL: name: load_constant_v2p1
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<2 x p1>) = G_LOAD [[COPY]](p4) :: (load (<2 x p1>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<2 x p1>)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -553,13 +483,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(s128) = G_LOAD [[COPY]](p4) :: (load (s128), align 4, addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](s128)
-    ;
-    ; GFX11-LABEL: name: load_constant_s128_align4
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(s128) = G_LOAD [[COPY]](p4) :: (load (s128), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](s128)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -604,13 +527,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (p3), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_p3_from_4
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (p3), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -655,13 +571,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (p4), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_p4_from_8
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (p4), addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p4) = G_LOAD %0 :: (load (p4), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -706,13 +615,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p999) = G_LOAD [[COPY]](p4) :: (load (p999), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](p999)
-    ;
-    ; GFX11-LABEL: name: load_constant_p999_from_8
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p999) = G_LOAD [[COPY]](p4) :: (load (p999), addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](p999)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -757,13 +659,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sreg_64(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](<2 x p3>)
-    ;
-    ; GFX11-LABEL: name: load_constant_v2p3
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[LOAD:%[0-9]+]]:sreg_64(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -808,13 +703,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_v2s16
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (load (<2 x s16>), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 4)
     $sgpr0 = COPY %1
@@ -859,13 +747,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), addrspace 4)
     ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_v4s16
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 4)
     $sgpr0_sgpr1 = COPY %1
@@ -908,15 +789,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_v8s16
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -959,15 +833,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_v8s32
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
@@ -1010,15 +877,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_v16s32
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -1061,15 +921,8 @@ body: |
     ; GFX10: liveins: $sgpr0_sgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
-    ;
-    ; GFX11-LABEL: name: load_constant_v8s64
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
-    ; GFX11-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+    ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), align 4, addrspace 4)
     $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -1118,13 +971,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1020, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1020
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1020, 0 :: (load (s32), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1020
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1172,13 +1018,6 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1024, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1024
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 1024, 0 :: (load (s32), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1024
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1228,14 +1067,6 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1048575
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048575
-    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1048575
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1285,14 +1116,6 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1048576
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
-    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1048576
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1343,14 +1166,6 @@ body: |
     ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741823
     ; GFX10-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_1073741823
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741823
-    ; GFX11-NEXT: [[S_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_SGPR_IMM [[COPY]], [[S_MOV_B32_]], 0, 0 :: (load (s32), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_SGPR_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 1073741823
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1429,21 +1244,6 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_1
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
-    ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
-    ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -1
     %2:sgpr(p4) = G_PTR_ADD %0, %1
@@ -1522,21 +1322,6 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
     ; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
-    ;
-    ; GFX11-LABEL: name: load_constant_s32_from_4_gep_negative_524288
-    ; GFX11: liveins: $sgpr0_sgpr1
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
-    ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
-    ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
-    ; GFX11-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
     %0:sgpr(p4) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = G_CONSTANT i64 -524288
     %2:sgpr(p4) = G_PTR_ADD %0, %1
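
inst-select-load-constant.mir applies the same de-duplication from the other direction: the gfx1100 RUN line above is redirected to the existing GFX10 prefix, after which every GFX11 block becomes redundant and is deleted, and the GFX10 bodies that matched the _ec early-clobber loads are updated to the plain forms. Check bodies like these are normally regenerated rather than edited by hand; a typical regeneration step, assuming an in-tree build (the binary path is illustrative):

  $ llvm/utils/update_mir_test_checks.py --llc-binary build/bin/llc \
        llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
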
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index f88125ea02937..90f34acaa17aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -48,16 +48,16 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX10-LABEL: test_div_scale_f32_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, v2, v2, v1
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v2, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_1:
@@ -133,16 +133,16 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX10-LABEL: test_div_scale_f32_2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, v1, v2, v1
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v1, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_2:
@@ -1266,14 +1266,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
 ;
 ; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, v0, 1.0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, 1.0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_inline_imm_num:
@@ -1337,14 +1337,14 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
 ;
 ; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, 2.0, 2.0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, 2.0, 2.0, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_inline_imm_den:
@@ -1416,17 +1416,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
 ;
 ; GFX10-LABEL: test_div_scale_f32_fabs_num:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, v2, v2, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v2, v2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_fabs_num:
@@ -1508,17 +1508,17 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
 ;
 ; GFX10-LABEL: test_div_scale_f32_fabs_den:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v2
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, v0, v1
+; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_div_scale_f32_fabs_den:
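
The preceding llvm.amdgcn.div.scale.ll hunks, and the .ll updates that follow, leave the instruction mix essentially unchanged: they renumber the scalar registers picked by the allocator, for example s_load_dwordx4 s[4:7] becomes s[0:3] and the scratch SGPR written by v_div_scale_f32 moves from s0 to s2. Diffs of this shape are mechanical and come from rerunning the llc check generator over each test, for example (illustrative invocation):

  $ llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
        llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
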
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index d1fa579ca9b4b..69f9a5712b0b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -658,17 +658,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40c00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1013-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1013-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1013-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1013-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
@@ -681,7 +681,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -769,14 +769,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1013-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1013-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1013-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1013-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
 ; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
@@ -789,7 +789,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index 4946d3759c2ef..aa21e67544d65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -63,14 +63,14 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
 ;
 ; GFX10-LABEL: mov_dpp64_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24 ; encoding: [0x01,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e]
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
 ; GFX10-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
 ; GFX10-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
 ; GFX11-LABEL: mov_dpp64_test:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 1e8209bd3fc6b..2198ba9f1d964 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ;
 ; GFX10-LABEL: dpp_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: dpp_test:
@@ -64,16 +64,16 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
 ;
 ; GFX10-LABEL: update_dppi64_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppi64_test:
@@ -120,16 +120,16 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
 ;
 ; GFX10-LABEL: update_dppf64_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppf64_test:
@@ -176,16 +176,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 ;
 ; GFX10-LABEL: update_dppv2i32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2i32_test:
@@ -232,16 +232,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
 ;
 ; GFX10-LABEL: update_dppv2f32_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dppv2f32_test:
@@ -288,16 +288,16 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
 ;
 ; GFX10-LABEL: update_dpp_p0_test:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s7
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: update_dpp_p0_test:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 489f46d1237a3..577a7d0b4cba0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -7,18 +7,18 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; GFX10-LABEL: v_mul_i64_no_zext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[4:5]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[0:1]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, v0, v2, 0
 ; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
-; GFX10-NEXT:    global_store_dwordx2 v7, v[4:5], s[6:7]
+; GFX10-NEXT:    global_store_dwordx2 v7, v[4:5], s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i64_no_zext:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 42f1bf84c0420..b0f3eee3c7363 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2559,47 +2559,47 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
 ;
 ; GFX8-LABEL: s_mul_u64_zext_with_sregs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX8-NEXT:    s_mulk_i32 s0, 0x50
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT:    s_mulk_i32 s2, 0x50
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_mul_u64_zext_with_sregs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s0, s1, 0x50
-; GFX9-NEXT:    s_mul_hi_u32 s1, s1, 0x50
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_mul_i32 s2, s3, 0x50
+; GFX9-NEXT:    s_mul_hi_u32 s3, s3, 0x50
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_mul_u64_zext_with_sregs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s3, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mul_i32 s0, s1, 0x50
-; GFX10-NEXT:    s_mul_hi_u32 s1, s1, 0x50
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    s_mul_i32 s2, s3, 0x50
+; GFX10-NEXT:    s_mul_hi_u32 s3, s3, 0x50
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_mul_u64_zext_with_sregs:
@@ -2738,56 +2738,56 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
 ;
 ; GFX8-LABEL: s_mul_u64_sext_with_sregs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX8-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX8-NEXT:    s_mulk_i32 s0, 0x50
-; GFX8-NEXT:    s_mulk_i32 s1, 0x50
-; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    s_add_u32 s1, s1, s2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX8-NEXT:    s_mulk_i32 s2, 0x50
+; GFX8-NEXT:    s_mulk_i32 s3, 0x50
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    s_add_u32 s3, s3, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_mul_u64_sext_with_sregs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX9-NEXT:    s_mul_i32 s0, s1, 0x50
-; GFX9-NEXT:    s_mul_hi_u32 s1, s1, 0x50
-; GFX9-NEXT:    s_mulk_i32 s2, 0x50
-; GFX9-NEXT:    s_add_u32 s1, s2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_mul_i32 s2, s3, 0x50
+; GFX9-NEXT:    s_mul_hi_u32 s3, s3, 0x50
+; GFX9-NEXT:    s_mulk_i32 s4, 0x50
+; GFX9-NEXT:    s_add_u32 s3, s4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_mul_u64_sext_with_sregs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX10-NEXT:    s_mul_hi_u32 s2, s0, 0x50
-; GFX10-NEXT:    s_mulk_i32 s1, 0x50
-; GFX10-NEXT:    s_mulk_i32 s0, 0x50
-; GFX10-NEXT:    s_add_i32 s1, s2, s1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX10-NEXT:    s_mul_hi_u32 s4, s2, 0x50
+; GFX10-NEXT:    s_mulk_i32 s3, 0x50
+; GFX10-NEXT:    s_mulk_i32 s2, 0x50
+; GFX10-NEXT:    s_add_i32 s3, s4, s3
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_mul_u64_sext_with_sregs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 7d7f450e590fa..cf69c50ed9357 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -305,25 +305,25 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ;
 ; GFX9-LABEL: sdivrem_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s13, 31
-; GFX9-NEXT:    s_ashr_i32 s4, s15, 31
-; GFX9-NEXT:    s_add_u32 s0, s12, s2
-; GFX9-NEXT:    s_addc_u32 s1, s13, s2
-; GFX9-NEXT:    s_add_u32 s6, s14, s4
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s7, s15, s4
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX9-NEXT:    s_add_u32 s0, s8, s2
+; GFX9-NEXT:    s_addc_u32 s1, s9, s2
+; GFX9-NEXT:    s_add_u32 s8, s10, s12
+; GFX9-NEXT:    s_mov_b32 s13, s12
+; GFX9-NEXT:    s_addc_u32 s9, s11, s12
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_u32 s14, 0, s6
-; GFX9-NEXT:    s_subb_u32 s15, 0, s7
+; GFX9-NEXT:    s_sub_u32 s14, 0, s8
+; GFX9-NEXT:    s_subb_u32 s15, 0, s9
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -357,7 +357,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s7
+; GFX9-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
@@ -382,52 +382,52 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX9-NEXT:    v_mul_hi_u32 v6, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s11, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s12, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s10, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT:    v_sub_u32_e32 v1, s13, v1
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
+; GFX9-NEXT:    v_sub_u32_e32 v1, s11, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v0
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s6, v0
+; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s8, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v9
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s6, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -442,7 +442,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v5
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -453,27 +453,27 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s2, s13, 31
-; GFX10-NEXT:    s_ashr_i32 s4, s15, 31
-; GFX10-NEXT:    s_add_u32 s0, s12, s2
-; GFX10-NEXT:    s_addc_u32 s1, s13, s2
-; GFX10-NEXT:    s_add_u32 s6, s14, s4
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    s_addc_u32 s7, s15, s4
+; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX10-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX10-NEXT:    s_add_u32 s0, s8, s2
+; GFX10-NEXT:    s_addc_u32 s1, s9, s2
+; GFX10-NEXT:    s_add_u32 s8, s10, s12
+; GFX10-NEXT:    s_mov_b32 s13, s12
+; GFX10-NEXT:    s_addc_u32 s9, s11, s12
 ; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX10-NEXT:    s_sub_u32 s12, 0, s6
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX10-NEXT:    s_sub_u32 s10, 0, s8
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -484,12 +484,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s13, s12, v3, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s13, s12, v4, v[1:2]
-; GFX10-NEXT:    s_subb_u32 s13, 0, s7
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s11, s10, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s11, s10, v4, v[1:2]
+; GFX10-NEXT:    s_subb_u32 s11, 0, s9
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s14, s13, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s14, s11, v3, v[1:2]
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
@@ -511,28 +510,28 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s14, s12, v3, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s12, s12, v4, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s14, s10, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s10, v4, v[1:2]
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s12, s13, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s11, v3, v[1:2]
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
-; GFX10-NEXT:    v_add_co_u32 v2, s12, v2, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s12
-; GFX10-NEXT:    v_add_co_u32 v6, s12, v7, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s12
-; GFX10-NEXT:    v_add_co_u32 v0, s12, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s12
-; GFX10-NEXT:    v_add_co_u32 v2, s12, v6, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v6, s10, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT:    v_add_co_u32 v0, s12, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s10
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v3, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v0
@@ -541,70 +540,71 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v1
-; GFX10-NEXT:    v_add_co_u32 v2, s12, v2, v3
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s12
-; GFX10-NEXT:    v_add_co_u32 v2, s12, v2, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s12
-; GFX10-NEXT:    v_add_co_u32 v0, s12, v5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v5, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v6, v2
-; GFX10-NEXT:    v_add_co_u32 v0, s12, v0, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s12
-; GFX10-NEXT:    v_add_co_u32 v5, s12, v0, v2
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v5, s10, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s12, s6, v5, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s10, s8, v5, 0
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s12, s6, v3, v[1:2]
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s12, s7, v5, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s8, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s9, v5, v[1:2]
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v5, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s1, v1
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, s6
+; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, s8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v1
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v1
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v8
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v9
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v13, s0, v2, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v9
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s0
-; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s6
+; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_xor_b32_e32 v2, s4, v2
-; GFX10-NEXT:    v_xor_b32_e32 v3, s5, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, s8, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, s9, v3
 ; GFX10-NEXT:    v_xor_b32_e32 v5, s2, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v6, s2, v1
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, s4
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v3, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv i64 %x, %y
   store i64 %div, ptr addrspace(1) %out0
@@ -692,121 +692,121 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: sdivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s14, 31
-; GFX9-NEXT:    s_add_i32 s1, s14, s0
-; GFX9-NEXT:    s_xor_b32 s1, s1, s0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX9-NEXT:    s_ashr_i32 s2, s15, 31
-; GFX9-NEXT:    s_add_i32 s3, s15, s2
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
+; GFX9-NEXT:    s_add_i32 s6, s6, s8
+; GFX9-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_ashr_i32 s9, s7, 31
+; GFX9-NEXT:    s_add_i32 s7, s7, s9
+; GFX9-NEXT:    s_xor_b32 s7, s7, s9
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT:    s_sub_i32 s6, 0, s1
-; GFX9-NEXT:    s_ashr_i32 s4, s12, 31
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT:    s_sub_i32 s12, 0, s6
+; GFX9-NEXT:    s_ashr_i32 s10, s4, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s7, 0, s3
-; GFX9-NEXT:    s_ashr_i32 s5, s13, 31
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
+; GFX9-NEXT:    s_add_i32 s4, s4, s10
+; GFX9-NEXT:    s_xor_b32 s4, s4, s10
+; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s6, s12, s4
+; GFX9-NEXT:    s_sub_i32 s12, 0, s7
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_xor_b32 s6, s6, s4
-; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
-; GFX9-NEXT:    s_add_i32 s7, s13, s5
+; GFX9-NEXT:    s_ashr_i32 s11, s5, 31
+; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT:    s_add_i32 s5, s5, s11
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GFX9-NEXT:    s_xor_b32 s7, s7, s5
-; GFX9-NEXT:    s_xor_b32 s0, s4, s0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s1
+; GFX9-NEXT:    s_xor_b32 s5, s5, s11
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v3
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s1, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s3
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s7
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    s_xor_b32 s4, s10, s8
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
-; GFX9-NEXT:    s_xor_b32 s0, s5, s2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
+; GFX9-NEXT:    s_xor_b32 s4, s11, s9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
-; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s1, s14, 31
-; GFX10-NEXT:    s_ashr_i32 s2, s15, 31
-; GFX10-NEXT:    s_add_i32 s0, s14, s1
-; GFX10-NEXT:    s_add_i32 s3, s15, s2
-; GFX10-NEXT:    s_xor_b32 s4, s0, s1
+; GFX10-NEXT:    s_ashr_i32 s1, s10, 31
+; GFX10-NEXT:    s_ashr_i32 s2, s11, 31
+; GFX10-NEXT:    s_add_i32 s0, s10, s1
+; GFX10-NEXT:    s_add_i32 s3, s11, s2
+; GFX10-NEXT:    s_xor_b32 s10, s0, s1
 ; GFX10-NEXT:    s_xor_b32 s3, s3, s2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX10-NEXT:    s_sub_i32 s0, 0, s4
-; GFX10-NEXT:    s_sub_i32 s5, 0, s3
-; GFX10-NEXT:    s_ashr_i32 s6, s13, 31
+; GFX10-NEXT:    s_sub_i32 s0, 0, s10
+; GFX10-NEXT:    s_sub_i32 s11, 0, s3
+; GFX10-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    s_add_i32 s7, s13, s6
-; GFX10-NEXT:    s_xor_b32 s7, s7, s6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v1
-; GFX10-NEXT:    s_ashr_i32 s5, s12, 31
-; GFX10-NEXT:    s_add_i32 s0, s12, s5
-; GFX10-NEXT:    s_xor_b32 s1, s5, s1
-; GFX10-NEXT:    s_xor_b32 s0, s0, s5
+; GFX10-NEXT:    v_mul_lo_u32 v3, s11, v1
+; GFX10-NEXT:    s_ashr_i32 s11, s8, 31
+; GFX10-NEXT:    s_add_i32 s0, s8, s11
+; GFX10-NEXT:    s_add_i32 s8, s9, s12
+; GFX10-NEXT:    s_xor_b32 s0, s0, s11
+; GFX10-NEXT:    s_xor_b32 s8, s8, s12
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX10-NEXT:    s_xor_b32 s1, s11, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s7, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s4
+; GFX10-NEXT:    v_mul_hi_u32 v1, s8, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s7, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s8, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
@@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s0, s6, s2
+; GFX10-NEXT:    s_xor_b32 s0, s12, s2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v2, s5, v2
-; GFX10-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, s11, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, s12, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s5, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v3
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s11, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s12, v3
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index 8ea5fc25d95de..9ee0acf2aa2db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -92,8 +92,8 @@ entry:
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
-; GFX9_10: s_add_u32 s0, s6, -4
-; GFX9_10: s_addc_u32 s1, s7, -1
+; GFX9_10: s_add_u32 s2, s2, -4
+; GFX9_10: s_addc_u32 s3, s3, -1
 ; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 5aef667934709..8b94f93e44e56 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -251,12 +251,12 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ;
 ; GFX9-LABEL: udivrem_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s15
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s14
-; GFX9-NEXT:    s_sub_u32 s2, 0, s14
-; GFX9-NEXT:    s_subb_u32 s3, 0, s15
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s10
+; GFX9-NEXT:    s_sub_u32 s2, 0, s10
+; GFX9-NEXT:    s_subb_u32 s3, 0, s11
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -293,7 +293,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s15
+; GFX9-NEXT:    v_mov_b32_e32 v7, s11
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
@@ -318,52 +318,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX9-NEXT:    v_mul_hi_u32 v6, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX9-NEXT:    v_mul_hi_u32 v6, s9, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s14, v5, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s10, v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s15, v5, v[1:2]
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s12, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s11, v5, v[1:2]
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s8, v0
 ; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v6
-; GFX9-NEXT:    v_sub_u32_e32 v0, s13, v1
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v6
+; GFX9-NEXT:    v_sub_u32_e32 v0, s9, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v6
 ; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s14, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s10, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v9
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v9
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s14, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s10, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
@@ -378,17 +378,17 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v15, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v5, s[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s15
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s14
-; GFX10-NEXT:    s_sub_u32 s0, 0, s14
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s11
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s10
+; GFX10-NEXT:    s_sub_u32 s0, 0, s10
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -401,7 +401,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s0, v3, 0
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2]
-; GFX10-NEXT:    s_subb_u32 s1, 0, s15
+; GFX10-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2]
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
@@ -449,14 +449,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v3, v0
 ; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX10-NEXT:    v_mul_lo_u32 v2, s9, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v4, s12, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX10-NEXT:    v_mul_lo_u32 v5, s13, v1
+; GFX10-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX10-NEXT:    v_mul_lo_u32 v5, s9, v1
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v3
-; GFX10-NEXT:    v_mul_hi_u32 v3, s12, v1
+; GFX10-NEXT:    v_mul_hi_u32 v3, s8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
@@ -466,38 +466,38 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v0, v2
-; GFX10-NEXT:    v_mul_hi_u32 v2, s13, v1
+; GFX10-NEXT:    v_mul_hi_u32 v2, s9, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s14, v5, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s10, v5, 0
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s14, v3, v[1:2]
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s15, v5, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s10, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s11, v5, v[1:2]
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v5, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v7, vcc_lo, s12, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s13, v1
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s0, s13, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v7
+; GFX10-NEXT:    v_sub_co_u32 v7, vcc_lo, s8, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s9, v1
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s14
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s10
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v6
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v9
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v13, s0, v2, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s15, v9
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s15, v8
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v8
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s0
-; GFX10-NEXT:    v_sub_co_u32 v10, s0, v6, s14
+; GFX10-NEXT:    v_sub_co_u32 v10, s0, v6, s10
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
@@ -509,8 +509,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v8, v9, s0
-; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[8:9]
-; GFX10-NEXT:    global_store_dwordx2 v10, v[2:3], s[10:11]
+; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v10, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv i64 %x, %y
   store i64 %div, ptr addrspace(1) %out0
@@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: udivrem_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX9-NEXT:    s_sub_i32 s0, 0, s14
-; GFX9-NEXT:    s_sub_i32 s1, 0, s15
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX9-NEXT:    s_sub_i32 s0, 0, s10
+; GFX9-NEXT:    s_sub_i32 s1, 0, s11
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s13, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s14
+; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s10
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s15
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s12, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, s8, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s15, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s10, v2
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s15, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX10-NEXT:    s_sub_i32 s0, 0, s14
-; GFX10-NEXT:    s_sub_i32 s1, 0, s15
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX10-NEXT:    s_sub_i32 s0, 0, s10
+; GFX10-NEXT:    s_sub_i32 s1, 0, s11
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s13, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s14
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s15
+; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s12, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s13, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s8, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s9, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s14, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s15, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s10, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s11, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[8:9]
-; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[10:11]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i32> %x, %y
   store <2 x i32> %div, ptr addrspace(1) %out0
@@ -1248,16 +1248,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ;
 ; GFX9-LABEL: udivrem_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x20
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x20
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s17
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s16
-; GFX9-NEXT:    s_sub_u32 s2, 0, s16
-; GFX9-NEXT:    s_subb_u32 s3, 0, s17
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s13
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX9-NEXT:    s_sub_u32 s2, 0, s12
+; GFX9-NEXT:    s_subb_u32 s3, 0, s13
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1293,12 +1293,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT:    s_sub_u32 s2, 0, s18
+; GFX9-NEXT:    s_sub_u32 s2, 0, s14
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT:    s_subb_u32 s3, 0, s19
+; GFX9-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1317,47 +1317,48 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, s13, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX9-NEXT:    v_mul_hi_u32 v5, s9, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s17
+; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s16, v8, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add3_u32 v9, v3, v0, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v5, s13
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3]
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s12, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s8, v1
 ; GFX9-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v1
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s17, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v4, v5, s[0:1]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s19
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s15
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v6, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s18
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s14
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s16, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s12, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
@@ -1369,13 +1370,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v12
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v11
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s17, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, v5, v17, s[0:1]
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e64 v18, s[0:1], 1, v13
@@ -1384,7 +1385,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v7, v6, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v15, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v16, v4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v20, vcc, s16, v11
+; GFX9-NEXT:    v_subrev_co_u32_e32 v20, vcc, s12, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v16, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
@@ -1440,55 +1441,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add3_u32 v6, v9, v8, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v16, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s15, v5
-; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v2, v7, s[0:1]
-; GFX9-NEXT:    v_mul_hi_u32 v2, s14, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, s15, v5
+; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, s11, v5
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s15, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v6
 ; GFX9-NEXT:    v_add_u32_e32 v2, v9, v2
-; GFX9-NEXT:    v_mul_hi_u32 v9, s14, v6
-; GFX9-NEXT:    v_mul_hi_u32 v13, s15, v6
+; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v6
+; GFX9-NEXT:    v_mul_hi_u32 v13, s11, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v8, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v5, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[2:3], s18, v12, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[2:3], s14, v12, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v1, v10, s[0:1]
 ; GFX9-NEXT:    v_add_u32_e32 v1, v11, v9
 ; GFX9-NEXT:    v_add3_u32 v9, v1, v2, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v9, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v10, s15
-; GFX9-NEXT:    v_mov_b32_e32 v6, s19
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s19, v12, v[1:2]
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s14, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v10, s11
+; GFX9-NEXT:    v_mov_b32_e32 v6, s15
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2]
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s10, v5
 ; GFX9-NEXT:    v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s19, v10
-; GFX9-NEXT:    v_sub_u32_e32 v1, s15, v1
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v10
+; GFX9-NEXT:    v_sub_u32_e32 v1, s11, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s18, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s19, v10
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v10
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s18, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s14, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v12
 ; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s19, v13
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s18, v11
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v11
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s19, v13
-; GFX9-NEXT:    v_subrev_co_u32_e32 v19, vcc, s18, v11
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v13
+; GFX9-NEXT:    v_subrev_co_u32_e32 v19, vcc, s14, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v17, s[0:1], 1, v14
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -1503,24 +1504,22 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v1, s[0:1]
-; GFX9-NEXT:    global_store_dwordx4 v0, v[3:6], s[8:9]
-; GFX9-NEXT:    global_store_dwordx4 v0, v[7:10], s[10:11]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[3:6], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v0, v[7:10], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udivrem_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x20
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x20
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s17
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s19
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s16
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s18
-; GFX10-NEXT:    s_sub_u32 s0, 0, s16
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s13
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s12
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s14
+; GFX10-NEXT:    s_sub_u32 s0, 0, s12
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT:    s_subb_u32 s1, 0, s17
+; GFX10-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -1540,13 +1539,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, s0, v7, 0
-; GFX10-NEXT:    s_sub_u32 s2, 0, s18
+; GFX10-NEXT:    s_sub_u32 s2, 0, s14
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s3, s2, v8, 0
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v9, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2]
 ; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4]
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v0
-; GFX10-NEXT:    s_subb_u32 s3, 0, s19
+; GFX10-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5]
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v7, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6]
@@ -1593,6 +1592,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, s0, v7, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s4, s2, v8, 0
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v9, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2]
 ; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4]
@@ -1641,20 +1641,21 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v8, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v3, s13, v4
-; GFX10-NEXT:    v_mul_lo_u32 v8, s12, v2
-; GFX10-NEXT:    v_mul_hi_u32 v5, s12, v4
-; GFX10-NEXT:    v_mul_hi_u32 v4, s13, v4
-; GFX10-NEXT:    v_mul_lo_u32 v9, s13, v2
-; GFX10-NEXT:    v_mul_lo_u32 v6, s15, v1
-; GFX10-NEXT:    v_mul_hi_u32 v10, s12, v2
-; GFX10-NEXT:    v_mul_hi_u32 v11, s13, v2
-; GFX10-NEXT:    v_mul_lo_u32 v2, s14, v0
-; GFX10-NEXT:    v_mul_hi_u32 v7, s14, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, s15, v1
-; GFX10-NEXT:    v_mul_lo_u32 v12, s15, v0
-; GFX10-NEXT:    v_mul_hi_u32 v13, s14, v0
-; GFX10-NEXT:    v_mul_hi_u32 v14, s15, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mul_lo_u32 v3, s9, v4
+; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v5, s8, v4
+; GFX10-NEXT:    v_mul_hi_u32 v4, s9, v4
+; GFX10-NEXT:    v_mul_lo_u32 v9, s9, v2
+; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
+; GFX10-NEXT:    v_mul_hi_u32 v10, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v11, s9, v2
+; GFX10-NEXT:    v_mul_lo_u32 v2, s10, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s10, v1
+; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
+; GFX10-NEXT:    v_mul_lo_u32 v12, s11, v0
+; GFX10-NEXT:    v_mul_hi_u32 v13, s10, v0
+; GFX10-NEXT:    v_mul_hi_u32 v14, s11, v0
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, v3, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v9, v4
@@ -1677,77 +1678,77 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_add_co_u32 v8, s0, v4, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v10, s0, v1, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s16, v8, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s12, v8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, s18, v10, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, s14, v10, 0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v9, v7
 ; GFX10-NEXT:    v_add3_u32 v9, v5, v4, v11
 ; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v8, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    v_add3_u32 v7, v7, v6, v14
-; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, s16, v9, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, s12, v9, v[1:2]
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, s18, v7, v[3:4]
-; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s0, s17, v8, v[4:5]
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, s14, v7, v[3:4]
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s0, s13, v8, v[4:5]
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v12, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s12, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6]
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s16, v14
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s13, v3
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s8, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s15, v10, v[5:6]
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s9, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v14
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s9, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s0
-; GFX10-NEXT:    v_sub_co_u32 v15, s0, s14, v2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s18, v15
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s15, v0
+; GFX10-NEXT:    v_sub_co_u32 v15, s0, s10, v2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s11, v0, s0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v15
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s11, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v17, vcc_lo, v14, s16
+; GFX10-NEXT:    v_sub_co_u32 v17, vcc_lo, v14, s12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s17, v5
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s17, v18
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s13, v5
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v23, s0, s15, v0, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v18
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s16, v17
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s12, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s17, v18
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s13, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s19, v16
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s15, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v21, v20, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s17, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, -1, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
-; GFX10-NEXT:    v_sub_co_u32 v0, s0, v17, s16
+; GFX10-NEXT:    v_sub_co_u32 v0, s0, v17, s12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, s1, v15, s18
+; GFX10-NEXT:    v_sub_co_u32 v6, s1, v15, s14
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v17, v0, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v3, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v18, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s19, v16
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s19, v12
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s18, v6
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v13, vcc_lo, v10, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s19, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v13, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, vcc_lo, s15, v23, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_sub_co_u32 v8, s1, v6, s18
+; GFX10-NEXT:    v_sub_co_u32 v8, s1, v6, s14
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v13, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v14, v17, vcc_lo
@@ -1758,8 +1759,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v13, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v15, v6, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v8, s1
-; GFX10-NEXT:    global_store_dwordx4 v11, v[0:3], s[8:9]
-; GFX10-NEXT:    global_store_dwordx4 v11, v[4:7], s[10:11]
+; GFX10-NEXT:    global_store_dwordx4 v11, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v11, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i64> %x, %y
   store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 1f1c2659e8110..c9a9eb9d91724 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -479,33 +479,33 @@ return:
 define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_phi_chain:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, -1
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX906-NEXT:    s_and_b64 s[2:3], exec, vcc
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GFX906-NEXT:  .LBB8_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[8:9]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
 ; GFX906-NEXT:  .LBB8_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[10:11]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -533,31 +533,31 @@ bb.3:
 define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_multi_block:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX906-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[2:3]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[8:9]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
 ; GFX906-NEXT:  .LBB9_3: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:  .LBB9_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[10:11]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index f36dcb487e915..ef2e57eafbf13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -18,24 +18,24 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a
 ;
 ; GFX9-LABEL: constant_load_i8_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_load_i8_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %ld = load i8, ptr addrspace(4) %in, align 4
   store i8 %ld, ptr addrspace(1) %out, align 4
@@ -57,24 +57,24 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr
 ;
 ; GFX9-LABEL: constant_load_i16_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_short v1, v0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_load_i16_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    global_store_short v1, v0, s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %ld = load i16, ptr addrspace(4) %in, align 4
   store i16 %ld, ptr addrspace(1) %out, align 4
@@ -97,26 +97,26 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: sextload_i8_to_i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_sext_i32_i8 s2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sextload_i8_to_i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_sext_i32_i8 s2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 4
   %sext = sext i8 %load to i32
@@ -140,26 +140,26 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: sextload_i16_to_i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_sext_i32_i16 s2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sextload_i16_to_i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %in, align 4
   %sext = sext i16 %load to i32
@@ -183,26 +183,26 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: zextload_i8_to_i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: zextload_i8_to_i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 4
   %zext = zext i8 %load to i32
@@ -226,26 +226,26 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: zextload_i16_to_i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: zextload_i16_to_i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %in, align 4
   %zext = zext i16 %load to i32
@@ -269,22 +269,22 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: constant_load_i8_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_load_i8_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 2
   store i8 %load, ptr addrspace(1) %out, align 2
@@ -307,22 +307,22 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-LABEL: constant_load_i16_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_load_i16_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %in, align 2
   store i16 %load, ptr addrspace(1) %out, align 2
@@ -351,24 +351,24 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: constant_sextload_i8_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_sbyte v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_sbyte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] offset:2
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_sextload_i8_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_sbyte v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_sbyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
-; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] offset:2
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 2
   %sextload = sext i8 %load to i32
@@ -398,24 +398,24 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: constant_zextload_i8_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] offset:2
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_zextload_i8_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
-; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] offset:2
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 2
   %zextload = zext i8 %load to i32
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 94d704fa3f92d..e9797fa1fc309 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -35,26 +35,26 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX9-LABEL: s_add_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_add_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_i32 s0, s0, s1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_add_i32 s2, s2, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_add_i32:
@@ -125,30 +125,30 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: s_add_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s1, s1, s3
-; GFX9-NEXT:    s_add_i32 s0, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_add_i32 s2, s5, s7
+; GFX9-NEXT:    s_add_i32 s3, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_add_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_i32 s0, s0, s2
-; GFX10-NEXT:    s_add_i32 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    s_add_i32 s2, s4, s6
+; GFX10-NEXT:    s_add_i32 s3, s5, s7
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_add_v2i32:
@@ -813,30 +813,30 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX9-LABEL: v_add_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_add_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_add_i32:
@@ -922,26 +922,26 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_add_imm_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x7b, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_add_imm_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_add_imm_i32:
@@ -1240,50 +1240,50 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: add64_in_branch:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB9_4
 ; GFX9-NEXT:  ; %bb.1: ; %else
-; GFX9-NEXT:    s_add_u32 s0, s8, s10
-; GFX9-NEXT:    s_addc_u32 s1, s9, s11
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GFX9-NEXT:    s_add_u32 s4, s4, s6
+; GFX9-NEXT:    s_addc_u32 s5, s5, s7
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB9_3
 ; GFX9-NEXT:  .LBB9_2: ; %if
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX9-NEXT:  .LBB9_3: ; %endif
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ; GFX9-NEXT:  .LBB9_4:
-; GFX9-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-NEXT:    s_branch .LBB9_2
 ;
 ; GFX10-LABEL: add64_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB9_4
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    s_add_u32 s0, s8, s10
-; GFX10-NEXT:    s_addc_u32 s1, s9, s11
+; GFX10-NEXT:    s_add_u32 s4, s4, s6
+; GFX10-NEXT:    s_addc_u32 s5, s5, s7
 ; GFX10-NEXT:    s_cbranch_execnz .LBB9_3
 ; GFX10-NEXT:  .LBB9_2: ; %if
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX10-NEXT:  .LBB9_3: ; %endif
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ; GFX10-NEXT:  .LBB9_4:
-; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX10-NEXT:    s_branch .LBB9_2
 ;
 ; GFX11-LABEL: add64_in_branch:
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 4cc384e9d2718..b751be51a9739 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -186,24 +186,24 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: s_test_add_self_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_add_u16 v1, s0, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_pk_add_u16 v1, s2, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_add_self_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, s0, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_pk_add_u16 v1, s2, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_add_self_v2i16:
@@ -245,21 +245,21 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX9-LABEL: s_test_add_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_pk_add_u16 v1, s6, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_pk_add_u16 v1, s2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_add_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, s6, s7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_add_v2i16_kernarg:
@@ -300,27 +300,27 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_add_v2i16_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0x1c8007b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s2, 0x1c8007b
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_constant:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, 0x1c8007b, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_constant:
@@ -369,27 +369,27 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_add_v2i16_neg_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0xfc21fcb3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s2, 0xfc21fcb3
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_neg_constant:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, 0xfc21fcb3, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_neg_constant:
@@ -437,26 +437,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, -1
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, -1
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
@@ -503,26 +503,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 32
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, 32
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
@@ -570,26 +570,26 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 1.0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, 1.0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
index 88203202a320d..def6df9adf597 100644
--- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
 ; GFX9-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
 ; GFX9-NEXT:  ; %bb.1: ; %else
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_endpgm
 ; GFX9-NEXT:  .LBB2_2: ; %then
@@ -63,11 +63,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
 ; GFX10-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB2_2
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ; GFX10-NEXT:  .LBB2_2: ; %then
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index 95f59479c73e8..77976e470fc78 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i
 ; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
 ; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2
 ; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16,
+; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16,
 ; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
 ; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
 ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
 ; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16,
+; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
 define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
                                                     ptr addrspace(1) inreg %out) {
   %v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 96e92bb3dce0d..7cf18171a6cd7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_sub_i32 s0, 0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX9-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX9-NEXT:    s_mul_i32 s1, s0, s7
-; GFX9-NEXT:    s_sub_i32 s1, s6, s1
-; GFX9-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-NEXT:    s_sub_i32 s3, s1, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX9-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT:    s_mul_i32 s5, s4, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_add_i32 s5, s4, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_sub_i32 s0, 0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX9-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX9-NEXT:    s_mul_i32 s0, s0, s7
-; GFX9-NEXT:    s_sub_i32 s0, s6, s0
-; GFX9-NEXT:    s_sub_i32 s1, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX9-NEXT:    s_sub_i32 s1, s0, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s4, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_sub_i32 s4, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
@@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s0, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    s_xor_b32 s1, s6, s7
-; GFX9-NEXT:    s_abs_i32 s2, s6
-; GFX9-NEXT:    s_sub_i32 s3, 0, s0
+; GFX9-NEXT:    s_abs_i32 s4, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_sub_i32 s5, 0, s4
+; GFX9-NEXT:    s_xor_b32 s3, s2, s3
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 31
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s3, s3, s6
-; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
-; GFX9-NEXT:    s_add_i32 s6, s6, s3
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
-; GFX9-NEXT:    s_mul_i32 s6, s3, s0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s6, s5, s4
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s6
-; GFX9-NEXT:    s_add_i32 s7, s3, 1
-; GFX9-NEXT:    s_sub_i32 s6, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX9-NEXT:    s_add_i32 s7, s5, 1
+; GFX9-NEXT:    s_sub_i32 s6, s2, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX9-NEXT:    s_add_i32 s6, s3, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s6, s3
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_sub_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_add_i32 s6, s5, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s6, s5
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
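
The sdiv variant wraps the same unsigned core in sign handling: s_abs_i32 on both operands, s_ashr_i32 on %x ^ %y for the result sign, and the trailing s_xor_b32/s_sub_i32 conditional negate. A sketch of that wrapper (the quotient is written as plain division here; in the asm it is the reciprocal expansion plus the "+1" correction steps):

#include <cstdint>

int32_t sdiv_expanded(int32_t x, int32_t y) {
  uint32_t ax = x < 0 ? 0u - (uint32_t)x : (uint32_t)x;  // s_abs_i32
  uint32_t ay = y < 0 ? 0u - (uint32_t)y : (uint32_t)y;
  uint32_t s = (uint32_t)((x ^ y) >> 31);  // all-ones iff the signs differ
  uint32_t q = ax / ay;
  return (int32_t)((q ^ s) - s);           // conditional two's-complement negate
}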
@@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_abs_i32 s0, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
-; GFX9-NEXT:    s_abs_i32 s2, s6
-; GFX9-NEXT:    s_sub_i32 s3, 0, s0
+; GFX9-NEXT:    s_abs_i32 s3, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s3, s3, s6
-; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
-; GFX9-NEXT:    s_add_i32 s6, s6, s3
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
-; GFX9-NEXT:    s_mul_i32 s3, s3, s0
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s3, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX9-NEXT:    s_sub_i32 s3, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_sub_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s5, s5, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
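
srem differs from sdiv only in where the result sign comes from: s_ashr_i32 s4, s2, 31 above reads just %x, because the remainder takes the dividend's sign. Sketch:

#include <cstdint>

int32_t srem_expanded(int32_t x, int32_t y) {
  uint32_t ax = x < 0 ? 0u - (uint32_t)x : (uint32_t)x;
  uint32_t ay = y < 0 ? 0u - (uint32_t)y : (uint32_t)y;
  uint32_t s = (uint32_t)(x >> 31);  // dividend sign only, unlike sdiv's x ^ y
  uint32_t r = ax % ay;              // the reciprocal expansion in the asm
  return (int32_t)((r ^ s) - s);
}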
@@ -5486,13 +5486,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: udiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s7, 12
-; GFX9-NEXT:    s_lshr_b32 s0, s6, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_add_i32 s3, s3, 12
+; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
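
Here the division folds away entirely: x udiv (4096 << y) is x >> (12 + y) for the y values the IR's shift semantics allow, which is exactly the s_add_i32/s_lshr_b32 pair above. As a one-liner:

#include <cstdint>

uint32_t udiv_pow2_shl(uint32_t x, uint32_t y) {
  return x >> (12 + y);  // well-defined only while 12 + y < 32, as in the IR
}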
@@ -5528,14 +5528,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
-; GFX9-NEXT:    s_lshr_b32 s1, s7, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5574,18 +5574,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ;
 ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_hi_u32 s1, s7, 0x100101
-; GFX9-NEXT:    s_sub_i32 s2, s7, s1
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX9-NEXT:    s_add_i32 s2, s2, s1
-; GFX9-NEXT:    s_lshr_b32 s0, s6, 12
-; GFX9-NEXT:    s_lshr_b32 s1, s2, 11
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_mul_hi_u32 s4, s3, 0x100101
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, s4
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 11
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
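
The interesting lane is the division by 4095: its magic multiplier is ceil(2^44 / 4095) = 2^32 + 0x100101, which does not fit in 32 bits, so the standard subtract/shift/add fixup (Hacker's Delight, unsigned division by constants) reconstructs the wide product. Sketch of that lane:

#include <cstdint>

uint32_t udiv_4095(uint32_t x) {
  uint32_t q = (uint32_t)(((uint64_t)x * 0x100101u) >> 32);  // s_mul_hi_u32
  uint32_t t = (x - q) >> 1;  // s_sub_i32 + s_lshr_b32: folds in the 2^32 bit
  return (t + q) >> 11;       // s_add_i32 + s_lshr_b32: total shift of 12
}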
@@ -5879,14 +5879,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: urem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
-; GFX9-NEXT:    s_add_i32 s0, s0, -1
-; GFX9-NEXT:    s_and_b32 s0, s6, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX9-NEXT:    s_add_i32 s3, s3, -1
+; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
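
Remainder by a power of two is just a mask, so the whole test reduces to the s_lshl_b32 / s_add_i32 -1 / s_and_b32 run:

#include <cstdint>

uint32_t urem_pow2_shl(uint32_t x, uint32_t y) {
  return x & ((4096u << y) - 1);  // d - 1 is the low-bit mask when d = 2^k
}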
@@ -5922,14 +5922,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s6, 0xfff
-; GFX9-NEXT:    s_and_b32 s1, s7, 0xfff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6239,41 +6239,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
-; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    s_ashr_i32 s2, s6, 31
-; GFX9-NEXT:    s_add_i32 s3, s6, s2
-; GFX9-NEXT:    s_sub_i32 s6, 0, s0
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_i32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s6, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
 ; GFX9-NEXT:    s_mul_i32 s6, s6, s7
 ; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
-; GFX9-NEXT:    s_mul_hi_u32 s6, s3, s7
-; GFX9-NEXT:    s_mul_i32 s8, s6, s0
-; GFX9-NEXT:    s_sub_i32 s3, s3, s8
+; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s7
+; GFX9-NEXT:    s_mul_i32 s8, s6, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s8
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_sub_i32 s8, s3, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
+; GFX9-NEXT:    s_sub_i32 s8, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
 ; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s8, s2
 ; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s7, s6
-; GFX9-NEXT:    s_xor_b32 s1, s2, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_sub_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s6
+; GFX9-NEXT:    s_xor_b32 s3, s5, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
@@ -6315,20 +6315,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX9-NEXT:    s_add_i32 s0, s6, s0
-; GFX9-NEXT:    s_add_i32 s1, s7, s1
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
+; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    s_add_i32 s3, s3, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
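
Signed division by 2^12 has to round toward zero, so negative inputs are biased by 4095 first: s_ashr_i32 by 31 makes the sign mask, s_lshr_b32 by 20 turns it into 0 or 4095, and the final s_ashr_i32 by 12 divides. Per lane:

#include <cstdint>

int32_t sdiv_pow2k(int32_t x) {
  uint32_t bias = (uint32_t)(x >> 31) >> 20;   // 0 if x >= 0, else 4095
  return (int32_t)((uint32_t)x + bias) >> 12;  // arithmetic shift, per the asm
}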
@@ -6370,21 +6370,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
-; GFX9-NEXT:    s_mul_hi_i32 s1, s7, 0x80080081
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    s_add_i32 s1, s1, s7
-; GFX9-NEXT:    s_add_i32 s0, s6, s0
-; GFX9-NEXT:    s_lshr_b32 s2, s1, 31
-; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
-; GFX9-NEXT:    s_add_i32 s1, s1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_mul_hi_i32 s5, s3, 0x80080081
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_add_i32 s5, s5, s3
+; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s3, s5, 31
+; GFX9-NEXT:    s_ashr_i32 s4, s5, 11
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
+; GFX9-NEXT:    s_add_i32 s4, s4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
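
The 4095 lane here shows the signed magic-number pattern: 0x80080081 is negative as an i32, so the dividend is added back after the multiply-high (the s_add_i32 s5, s5, s3 above), and the intermediate's sign bit is added at the end to round the quotient toward zero. Sketch of that lane:

#include <cstdint>

int32_t sdiv_4095(int32_t x) {
  int32_t q0 = (int32_t)(((int64_t)x * (int32_t)0x80080081) >> 32) + x;
  return (q0 >> 11) + (int32_t)((uint32_t)q0 >> 31);  // s_ashr + s_lshr + s_add
}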
@@ -6753,38 +6753,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ;
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s7
-; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX9-NEXT:    s_ashr_i32 s1, s6, 31
-; GFX9-NEXT:    s_add_i32 s2, s6, s1
-; GFX9-NEXT:    s_sub_i32 s3, 0, s0
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_i32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s1
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s3, s3, s6
-; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
-; GFX9-NEXT:    s_add_i32 s6, s6, s3
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
-; GFX9-NEXT:    s_mul_i32 s3, s3, s0
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s3, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX9-NEXT:    s_sub_i32 s3, s2, s0
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_sub_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s5, s5, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
@@ -6828,22 +6828,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ;
 ; GFX9-LABEL: srem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
-; GFX9-NEXT:    s_add_i32 s0, s6, s0
-; GFX9-NEXT:    s_add_i32 s1, s7, s1
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xfffff000
-; GFX9-NEXT:    s_sub_i32 s0, s6, s0
-; GFX9-NEXT:    s_sub_i32 s1, s7, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
+; GFX9-NEXT:    s_add_i32 s4, s2, s4
+; GFX9-NEXT:    s_add_i32 s5, s3, s5
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    s_and_b32 s4, s5, 0xfffff000
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -7290,13 +7290,13 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: udiv_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
@@ -7871,12 +7871,12 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: urem_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s6, 0xfff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
@@ -8138,58 +8138,58 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: sdiv_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0x33fe64
-; GFX9-NEXT:    s_add_u32 s0, 0x396, s0
+; GFX9-NEXT:    s_mov_b32 s4, 0x33fe64
+; GFX9-NEXT:    s_add_u32 s4, 0x396, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT:    s_addc_u32 s1, 0, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    s_addc_u32 s5, 0, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_addc_u32 s0, s1, 0xd95
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    s_mul_i32 s1, s0, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xffed2705
-; GFX9-NEXT:    s_add_i32 s3, s3, s1
-; GFX9-NEXT:    s_sub_i32 s1, s3, s2
-; GFX9-NEXT:    s_mul_i32 s8, s2, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s1
-; GFX9-NEXT:    s_mul_i32 s12, s2, s1
-; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s8
-; GFX9-NEXT:    s_add_u32 s2, s2, s12
-; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s8
-; GFX9-NEXT:    s_mul_i32 s10, s0, s8
+; GFX9-NEXT:    s_addc_u32 s4, s5, 0xd95
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s4, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s7, s7, s5
+; GFX9-NEXT:    s_sub_i32 s5, s7, s6
+; GFX9-NEXT:    s_mul_i32 s8, s6, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s5
+; GFX9-NEXT:    s_mul_i32 s12, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
+; GFX9-NEXT:    s_add_u32 s6, s6, s12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s8
+; GFX9-NEXT:    s_mul_i32 s10, s4, s8
 ; GFX9-NEXT:    s_addc_u32 s8, 0, s11
-; GFX9-NEXT:    s_add_u32 s2, s2, s10
-; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
-; GFX9-NEXT:    s_addc_u32 s2, s8, s9
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    s_mul_i32 s1, s0, s1
-; GFX9-NEXT:    s_add_u32 s1, s2, s1
-; GFX9-NEXT:    s_addc_u32 s2, 0, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    s_add_u32 s6, s6, s10
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s5
+; GFX9-NEXT:    s_addc_u32 s6, s8, s9
+; GFX9-NEXT:    s_addc_u32 s7, s7, 0
+; GFX9-NEXT:    s_mul_i32 s5, s4, s5
+; GFX9-NEXT:    s_add_u32 s5, s6, s5
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s8, s0, s2
+; GFX9-NEXT:    s_addc_u32 s6, s4, s6
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX9-NEXT:    s_add_u32 s2, s6, s0
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_addc_u32 s3, s7, s0
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
-; GFX9-NEXT:    s_mul_i32 s7, s2, s8
+; GFX9-NEXT:    s_mul_i32 s8, s2, s6
 ; GFX9-NEXT:    s_mul_hi_u32 s10, s2, s9
-; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s8
-; GFX9-NEXT:    s_add_u32 s7, s10, s7
-; GFX9-NEXT:    s_addc_u32 s6, 0, s6
+; GFX9-NEXT:    s_mul_hi_u32 s7, s2, s6
+; GFX9-NEXT:    s_add_u32 s8, s10, s8
+; GFX9-NEXT:    s_addc_u32 s7, 0, s7
 ; GFX9-NEXT:    s_mul_hi_u32 s11, s3, s9
 ; GFX9-NEXT:    s_mul_i32 s9, s3, s9
-; GFX9-NEXT:    s_add_u32 s7, s7, s9
-; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s8
-; GFX9-NEXT:    s_addc_u32 s6, s6, s11
-; GFX9-NEXT:    s_addc_u32 s7, s10, 0
-; GFX9-NEXT:    s_mul_i32 s8, s3, s8
-; GFX9-NEXT:    s_add_u32 s6, s6, s8
-; GFX9-NEXT:    s_addc_u32 s7, 0, s7
+; GFX9-NEXT:    s_add_u32 s8, s8, s9
+; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s6
+; GFX9-NEXT:    s_addc_u32 s7, s7, s11
+; GFX9-NEXT:    s_addc_u32 s8, s10, 0
+; GFX9-NEXT:    s_mul_i32 s6, s3, s6
+; GFX9-NEXT:    s_add_u32 s6, s7, s6
+; GFX9-NEXT:    s_addc_u32 s7, 0, s8
 ; GFX9-NEXT:    s_add_u32 s8, s6, 1
 ; GFX9-NEXT:    s_addc_u32 s9, s7, 0
 ; GFX9-NEXT:    s_add_u32 s10, s6, 2
@@ -8222,13 +8222,13 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, s7
 ; GFX9-NEXT:    s_cselect_b32 s2, s8, s6
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GFX9-NEXT:    s_sub_u32 s2, s2, s0
-; GFX9-NEXT:    s_subb_u32 s3, s3, s0
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    s_sub_u32 s2, s2, s4
+; GFX9-NEXT:    s_subb_u32 s3, s3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i64 %x, 1235195
   store i64 %r, ptr addrspace(1) %out
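
A hedged compression of what this long block computes: a 64-bit signed divide by the odd constant 1235195 as a multiply-high against a fixed-point reciprocal. GFX9 has no 64-bit scalar multiply-high, so the asm assembles one from s_mul_i32/s_mul_hi_u32 pieces and refines the reciprocal with Newton-Raphson steps; with the compiler's 128-bit integer extension the idea is just:

#include <cstdint>

int64_t sdiv_oddk(int64_t x) {
  const uint64_t d = 1235195;
  const uint64_t m = ~0ull / d;  // floor(2^64 / d): the refined reciprocal
  uint64_t ax = x < 0 ? 0 - (uint64_t)x : (uint64_t)x;
  uint64_t q = (uint64_t)(((unsigned __int128)ax * m) >> 64);
  if (ax - q * d >= d) q += 1;   // the estimate is at most one too small
  return x < 0 ? 0 - (int64_t)q : (int64_t)q;
}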
@@ -8261,17 +8261,17 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: sdiv_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    s_add_u32 s0, s6, s0
-; GFX9-NEXT:    s_addc_u32 s1, s7, 0
-; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    s_addc_u32 s3, s3, 0
+; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
@@ -9132,19 +9132,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s8
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], 0x1000, s10
-; GFX9-NEXT:    s_ashr_i32 s12, s1, 31
-; GFX9-NEXT:    s_add_u32 s0, s0, s12
-; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    s_addc_u32 s1, s1, s12
-; GFX9-NEXT:    s_xor_b64 s[14:15], s[0:1], s[12:13]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
-; GFX9-NEXT:    s_sub_u32 s0, 0, s14
-; GFX9-NEXT:    s_subb_u32 s1, 0, s15
+; GFX9-NEXT:    s_ashr_i32 s8, s1, 31
+; GFX9-NEXT:    s_add_u32 s0, s0, s8
+; GFX9-NEXT:    s_mov_b32 s9, s8
+; GFX9-NEXT:    s_addc_u32 s1, s1, s8
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[8:9]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX9-NEXT:    s_sub_u32 s0, 0, s12
+; GFX9-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9153,60 +9153,60 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX9-NEXT:    s_mul_i32 s16, s0, s2
-; GFX9-NEXT:    s_mul_hi_u32 s18, s0, s3
-; GFX9-NEXT:    s_mul_i32 s17, s1, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s15, v0
+; GFX9-NEXT:    s_mul_i32 s16, s0, s14
+; GFX9-NEXT:    s_mul_hi_u32 s18, s0, s15
+; GFX9-NEXT:    s_mul_i32 s17, s1, s15
 ; GFX9-NEXT:    s_add_i32 s16, s18, s16
-; GFX9-NEXT:    s_mul_i32 s19, s0, s3
+; GFX9-NEXT:    s_mul_i32 s19, s0, s15
 ; GFX9-NEXT:    s_add_i32 s16, s16, s17
-; GFX9-NEXT:    s_mul_hi_u32 s17, s3, s16
-; GFX9-NEXT:    s_mul_i32 s18, s3, s16
-; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s19
-; GFX9-NEXT:    s_add_u32 s3, s3, s18
+; GFX9-NEXT:    s_mul_hi_u32 s17, s15, s16
+; GFX9-NEXT:    s_mul_i32 s18, s15, s16
+; GFX9-NEXT:    s_mul_hi_u32 s15, s15, s19
+; GFX9-NEXT:    s_add_u32 s15, s15, s18
 ; GFX9-NEXT:    s_addc_u32 s17, 0, s17
-; GFX9-NEXT:    s_mul_hi_u32 s20, s2, s19
-; GFX9-NEXT:    s_mul_i32 s19, s2, s19
-; GFX9-NEXT:    s_add_u32 s3, s3, s19
-; GFX9-NEXT:    s_mul_hi_u32 s18, s2, s16
-; GFX9-NEXT:    s_addc_u32 s3, s17, s20
+; GFX9-NEXT:    s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT:    s_mul_i32 s19, s14, s19
+; GFX9-NEXT:    s_add_u32 s15, s15, s19
+; GFX9-NEXT:    s_mul_hi_u32 s18, s14, s16
+; GFX9-NEXT:    s_addc_u32 s15, s17, s20
 ; GFX9-NEXT:    s_addc_u32 s17, s18, 0
-; GFX9-NEXT:    s_mul_i32 s16, s2, s16
-; GFX9-NEXT:    s_add_u32 s3, s3, s16
+; GFX9-NEXT:    s_mul_i32 s16, s14, s16
+; GFX9-NEXT:    s_add_u32 s15, s15, s16
 ; GFX9-NEXT:    s_addc_u32 s16, 0, s17
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s15, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s2, s2, s16
+; GFX9-NEXT:    s_addc_u32 s14, s14, s16
 ; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
-; GFX9-NEXT:    s_mul_i32 s3, s0, s2
+; GFX9-NEXT:    s_mul_i32 s15, s0, s14
 ; GFX9-NEXT:    s_mul_hi_u32 s17, s0, s16
-; GFX9-NEXT:    s_add_i32 s3, s17, s3
+; GFX9-NEXT:    s_add_i32 s15, s17, s15
 ; GFX9-NEXT:    s_mul_i32 s1, s1, s16
-; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_add_i32 s15, s15, s1
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s16
-; GFX9-NEXT:    s_mul_hi_u32 s17, s2, s0
-; GFX9-NEXT:    s_mul_i32 s18, s2, s0
-; GFX9-NEXT:    s_mul_i32 s20, s16, s3
+; GFX9-NEXT:    s_mul_hi_u32 s17, s14, s0
+; GFX9-NEXT:    s_mul_i32 s18, s14, s0
+; GFX9-NEXT:    s_mul_i32 s20, s16, s15
 ; GFX9-NEXT:    s_mul_hi_u32 s0, s16, s0
-; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s3
+; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s15
 ; GFX9-NEXT:    s_add_u32 s0, s0, s20
 ; GFX9-NEXT:    s_addc_u32 s16, 0, s19
 ; GFX9-NEXT:    s_add_u32 s0, s0, s18
-; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX9-NEXT:    s_mul_hi_u32 s1, s14, s15
 ; GFX9-NEXT:    s_addc_u32 s0, s16, s17
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    s_mul_i32 s3, s2, s3
-; GFX9-NEXT:    s_add_u32 s0, s0, s3
+; GFX9-NEXT:    s_mul_i32 s15, s14, s15
+; GFX9-NEXT:    s_add_u32 s0, s0, s15
 ; GFX9-NEXT:    s_addc_u32 s1, 0, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s16, s2, s1
-; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s0, s4, s2
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_addc_u32 s1, s5, s2
-; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[2:3]
+; GFX9-NEXT:    s_addc_u32 s16, s14, s1
+; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
+; GFX9-NEXT:    s_add_u32 s0, s4, s14
+; GFX9-NEXT:    s_mov_b32 s15, s14
+; GFX9-NEXT:    s_addc_u32 s1, s5, s14
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
 ; GFX9-NEXT:    v_readfirstlane_b32 s17, v0
 ; GFX9-NEXT:    s_mul_i32 s1, s4, s16
 ; GFX9-NEXT:    s_mul_hi_u32 s18, s4, s17
@@ -9222,24 +9222,24 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s16, s5, s16
 ; GFX9-NEXT:    s_add_u32 s16, s0, s16
 ; GFX9-NEXT:    s_addc_u32 s17, 0, s1
-; GFX9-NEXT:    s_mul_i32 s0, s14, s17
-; GFX9-NEXT:    s_mul_hi_u32 s1, s14, s16
+; GFX9-NEXT:    s_mul_i32 s0, s12, s17
+; GFX9-NEXT:    s_mul_hi_u32 s1, s12, s16
 ; GFX9-NEXT:    s_add_i32 s0, s1, s0
-; GFX9-NEXT:    s_mul_i32 s1, s15, s16
+; GFX9-NEXT:    s_mul_i32 s1, s13, s16
 ; GFX9-NEXT:    s_add_i32 s18, s0, s1
-; GFX9-NEXT:    s_mul_i32 s1, s14, s16
+; GFX9-NEXT:    s_mul_i32 s1, s12, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_sub_i32 s0, s5, s18
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s4, s0, s15
-; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s14, v0
+; GFX9-NEXT:    s_subb_u32 s4, s0, s13
+; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_subb_u32 s4, s4, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s15
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s13
 ; GFX9-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v1
-; GFX9-NEXT:    s_cmp_eq_u32 s4, s15
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s4, s13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -9257,35 +9257,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX9-NEXT:    s_subb_u32 s0, s5, s18
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s15
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s13
 ; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v0
-; GFX9-NEXT:    s_cmp_eq_u32 s0, s15
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
-; GFX9-NEXT:    s_ashr_i32 s2, s11, 31
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
+; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX9-NEXT:    s_add_u32 s4, s10, s2
+; GFX9-NEXT:    s_add_u32 s8, s10, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s17
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_addc_u32 s5, s11, s2
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s9, s11, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
-; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s9
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v0
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-NEXT:    s_sub_u32 s0, 0, s4
+; GFX9-NEXT:    s_sub_u32 s0, 0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s1
-; GFX9-NEXT:    s_subb_u32 s1, 0, s5
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
@@ -9362,24 +9362,24 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s12, s7, s12
 ; GFX9-NEXT:    s_add_u32 s12, s0, s12
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s1
-; GFX9-NEXT:    s_mul_i32 s0, s4, s13
-; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s12
+; GFX9-NEXT:    s_mul_i32 s0, s8, s13
+; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s12
 ; GFX9-NEXT:    s_add_i32 s0, s1, s0
-; GFX9-NEXT:    s_mul_i32 s1, s5, s12
+; GFX9-NEXT:    s_mul_i32 s1, s9, s12
 ; GFX9-NEXT:    s_add_i32 s14, s0, s1
-; GFX9-NEXT:    s_mul_i32 s1, s4, s12
+; GFX9-NEXT:    s_mul_i32 s1, s8, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    s_sub_i32 s0, s7, s14
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s6, s0, s5
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s4, v2
+; GFX9-NEXT:    s_subb_u32 s6, s0, s9
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v2
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_subb_u32 s6, s6, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s6, s5
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s9
 ; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
-; GFX9-NEXT:    s_cmp_eq_u32 s6, s5
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX9-NEXT:    s_cmp_eq_u32 s6, s9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s15
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -9397,10 +9397,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
 ; GFX9-NEXT:    s_subb_u32 s0, s7, s14
-; GFX9-NEXT:    s_cmp_ge_u32 s0, s5
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s9
 ; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX9-NEXT:    s_cmp_eq_u32 s0, s5
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -9410,14 +9410,13 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
@@ -9526,100 +9525,100 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: srem_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s0, 0x33fe64
-; GFX9-NEXT:    s_add_u32 s0, 0x396, s0
+; GFX9-NEXT:    s_mov_b32 s4, 0x33fe64
+; GFX9-NEXT:    s_add_u32 s4, 0x396, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT:    s_addc_u32 s1, 0, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    s_addc_u32 s5, 0, 0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_addc_u32 s0, s1, 0xd95
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    s_mul_i32 s1, s0, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xffed2705
-; GFX9-NEXT:    s_add_i32 s3, s3, s1
-; GFX9-NEXT:    s_sub_i32 s1, s3, s2
-; GFX9-NEXT:    s_mul_i32 s8, s2, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s1
-; GFX9-NEXT:    s_mul_i32 s12, s2, s1
-; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s8
-; GFX9-NEXT:    s_add_u32 s2, s2, s12
-; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s8
-; GFX9-NEXT:    s_mul_i32 s10, s0, s8
+; GFX9-NEXT:    s_addc_u32 s4, s5, 0xd95
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s4, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s7, s7, s5
+; GFX9-NEXT:    s_sub_i32 s5, s7, s6
+; GFX9-NEXT:    s_mul_i32 s8, s6, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s5
+; GFX9-NEXT:    s_mul_i32 s12, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
+; GFX9-NEXT:    s_add_u32 s6, s6, s12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s8
+; GFX9-NEXT:    s_mul_i32 s10, s4, s8
 ; GFX9-NEXT:    s_addc_u32 s8, 0, s11
-; GFX9-NEXT:    s_add_u32 s2, s2, s10
-; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
-; GFX9-NEXT:    s_addc_u32 s2, s8, s9
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    s_mul_i32 s1, s0, s1
-; GFX9-NEXT:    s_add_u32 s1, s2, s1
-; GFX9-NEXT:    s_addc_u32 s2, 0, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    s_add_u32 s6, s6, s10
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s5
+; GFX9-NEXT:    s_addc_u32 s6, s8, s9
+; GFX9-NEXT:    s_addc_u32 s7, s7, 0
+; GFX9-NEXT:    s_mul_i32 s5, s4, s5
+; GFX9-NEXT:    s_add_u32 s5, s6, s5
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s8, s0, s2
+; GFX9-NEXT:    s_addc_u32 s6, s4, s6
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX9-NEXT:    s_add_u32 s2, s6, s0
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_addc_u32 s3, s7, s0
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX9-NEXT:    s_mul_i32 s6, s2, s8
-; GFX9-NEXT:    s_mul_hi_u32 s9, s2, s7
-; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s8
-; GFX9-NEXT:    s_add_u32 s6, s9, s6
-; GFX9-NEXT:    s_addc_u32 s1, 0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s7
-; GFX9-NEXT:    s_mul_i32 s7, s3, s7
-; GFX9-NEXT:    s_add_u32 s6, s6, s7
-; GFX9-NEXT:    s_mul_hi_u32 s9, s3, s8
-; GFX9-NEXT:    s_addc_u32 s1, s1, s10
-; GFX9-NEXT:    s_addc_u32 s6, s9, 0
-; GFX9-NEXT:    s_mul_i32 s7, s3, s8
-; GFX9-NEXT:    s_add_u32 s1, s1, s7
-; GFX9-NEXT:    s_addc_u32 s6, 0, s6
-; GFX9-NEXT:    s_mul_hi_u32 s8, s1, 0x12d8fb
-; GFX9-NEXT:    s_mul_i32 s1, s1, 0x12d8fb
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s7, s2, s6
+; GFX9-NEXT:    s_mul_hi_u32 s9, s2, s8
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_add_u32 s7, s9, s7
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s8
+; GFX9-NEXT:    s_mul_i32 s8, s3, s8
+; GFX9-NEXT:    s_add_u32 s7, s7, s8
+; GFX9-NEXT:    s_mul_hi_u32 s9, s3, s6
+; GFX9-NEXT:    s_addc_u32 s5, s5, s10
+; GFX9-NEXT:    s_addc_u32 s7, s9, 0
+; GFX9-NEXT:    s_mul_i32 s6, s3, s6
+; GFX9-NEXT:    s_add_u32 s5, s5, s6
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s8, s5, 0x12d8fb
+; GFX9-NEXT:    s_mul_i32 s5, s5, 0x12d8fb
 ; GFX9-NEXT:    s_mul_i32 s6, s6, 0x12d8fb
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    s_add_i32 s8, s8, s6
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s1, s3, s8
+; GFX9-NEXT:    s_subb_u32 s2, s3, s8
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s7, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s2, s1, 0
+; GFX9-NEXT:    s_subb_u32 s3, s2, 0
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s7, v1
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s3, s2, 0
+; GFX9-NEXT:    s_subb_u32 s5, s3, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0x12d8fa
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v5, s2
-; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s3
+; GFX9-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
-; GFX9-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
   store i64 %r, ptr addrspace(1) %out
@@ -9654,19 +9653,19 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ;
 ; GFX9-LABEL: srem_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    s_add_u32 s0, s6, s0
-; GFX9-NEXT:    s_addc_u32 s1, s7, 0
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
-; GFX9-NEXT:    s_sub_u32 s0, s6, s0
-; GFX9-NEXT:    s_subb_u32 s1, s7, s1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX9-NEXT:    s_add_u32 s4, s2, s4
+; GFX9-NEXT:    s_addc_u32 s5, s3, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX9-NEXT:    s_sub_u32 s2, s2, s4
+; GFX9-NEXT:    s_subb_u32 s3, s3, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
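
srem by 2^12 is computed as x minus x rounded toward zero to a multiple of 4096: the bias-and-mask steps (s_ashr/s_lshr/s_add/s_and) form the rounded value and the s_sub_u32/s_subb_u32 pair does the 64-bit subtract. Sketch:

#include <cstdint>

int64_t srem_pow2k(int64_t x) {
  uint64_t bias = (uint64_t)(x >> 63) >> 52;           // 0 or 4095
  uint64_t rounded = ((uint64_t)x + bias) & ~4095ull;  // toward-zero multiple
  return (int64_t)((uint64_t)x - rounded);
}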
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index c6233642110ea..d6137597293f6 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
 ; SI: s_load_dword [[B:s[0-9]+]]
 ; SI: s_load_dwordx2
 ; SI-NOT: and
-; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
-; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
-; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
-; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
+; SI: s_lshl_b32 [[A]], [[A]], 1
+; SI: s_lshl_b32 [[B]], [[B]], 1
+; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
+; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
 define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
@@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
 
 ; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
 ; SI: s_load_dword [[A:s[0-9]+]]
-; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
+; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
 ; SI-NOT: and
-; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
+; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
 ; SI-NOT: and
 ; SI: s_add_u32
 ; SI-NEXT: s_addc_u32
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 6ec6f6aebf2b1..b1134ae78cb97 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -427,14 +427,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    s_mov_b32 null, 0
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_uniform:
@@ -1922,14 +1921,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB6_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_uniform:
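
For orientation, what the updated wave32 sub_i32_uniform lines compute: a single lane issues the atomic for the whole wave, and every lane rebuilds the pre-op value its own atomic would have returned as the returned old value minus v times the count of active lanes below it (v_mbcnt_lo_u32_b32). A host model using C++20 std::popcount, with an illustrative name:

#include <bit>
#include <cstdint>

uint32_t lane_old_value_sub(uint32_t returned_old, uint32_t v,
                            uint32_t exec_lo, unsigned lane) {
  // lane < 32 on wave32; the mask selects the active lanes below this one
  uint32_t mbcnt = (uint32_t)std::popcount(exec_lo & ((1u << lane) - 1u));
  return returned_old - v * mbcnt;  // v_mul_lo_u32 + v_sub_nc_u32_e32
}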
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index da88b6792f269..bc5d2662dcb45 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -89,101 +89,101 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: add_i32_constant:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
-; GFX9-NEXT:    s_mul_i32 s2, s2, 5
-; GFX9-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_mul_i32 s2, s2, 5
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB0_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
+; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s8, s6
-; GFX1064-NEXT:    s_mov_b32 s9, s7
+; GFX1064-NEXT:    s_mov_b32 s8, s2
+; GFX1064-NEXT:    s_mov_b32 s9, s3
 ; GFX1064-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB0_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i32_constant:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
+; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s8, s6
-; GFX1032-NEXT:    s_mov_b32 s9, s7
+; GFX1032-NEXT:    s_mov_b32 s8, s2
+; GFX1032-NEXT:    s_mov_b32 s9, s3
 ; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB0_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i32_constant:
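
The add_i32_constant hunks all update the same pattern: one lane performs a single buffer_atomic_add of 5 * popcount(exec) for the wave (s_bcnt1_i32_b64 + s_mul_i32), and each active lane reconstructs the value its own atomic would have returned with v_mbcnt_{lo,hi} feeding v_mad_u32_u24. A sequential host model of the wave64 case, with illustrative names:

#include <bit>
#include <cstdint>

void wave_add_constant(uint32_t &mem, uint64_t exec, uint32_t out[64]) {
  uint32_t base = mem;               // old value seen by the one real atomic
  mem += 5 * (uint32_t)std::popcount(exec);
  for (unsigned lane = 0; lane < 64; ++lane)
    if (exec & (1ull << lane)) {
      uint32_t mbcnt = (uint32_t)std::popcount(exec & ((1ull << lane) - 1));
      out[lane] = base + 5 * mbcnt;  // v_mad_u32_u24 v0, v0, 5, s4
    }
}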
@@ -766,137 +766,137 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-LABEL: add_i32_varying:
 ; GFX9_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, 0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX9_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s4
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s4
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s4
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s8, m0
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s8, s8, s6
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX9_ITERATIVE-NEXT:  ; %bb.3:
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s14, -1
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s12, s6
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s13, s7
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[12:15], 0 glc
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
 ; GFX9_ITERATIVE-NEXT:  .LBB2_4:
-; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX9_ITERATIVE-NEXT:    v_add_u32_e32 v0, s0, v1
-; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT:    v_add_u32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1064_ITERATIVE-LABEL: add_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s8, s6
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s7, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s7
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s7
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX1064_ITERATIVE-NEXT:    s_add_i32 s8, s8, s7
+; GFX1064_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
 ; GFX1064_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1064_ITERATIVE-NEXT:  .LBB2_4:
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX1064_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s0, v1
-; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1032_ITERATIVE-LABEL: add_i32_varying:
 ; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s4, 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1032_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s4, s1
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s4
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s4
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s4
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
-; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s5
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s1
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s1
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s6
+; GFX1032_ITERATIVE-NEXT:    s_add_i32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s5, exec_lo, s5
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
 ; GFX1032_ITERATIVE-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1032_ITERATIVE-NEXT:  .LBB2_4:
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX1032_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s0, v1
-; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1164_ITERATIVE-LABEL: add_i32_varying:
@@ -1180,17 +1180,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_DPP-LABEL: add_i32_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1202,33 +1202,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s6, v2, 63
 ; GFX9_DPP-NEXT:    s_nop 0
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
 ; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s8, s6
-; GFX9_DPP-NEXT:    s_mov_b32 s9, s7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9_DPP-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
 ; GFX9_DPP-NEXT:  .LBB2_2:
-; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX9_DPP-NEXT:    v_add_u32_e32 v0, s0, v0
-; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX9_DPP-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: add_i32_varying:
@@ -1249,50 +1249,50 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
-; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 16
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
+; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s6, 16
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 63
-; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 32
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s7, 32
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064_DPP-NEXT:    s_mov_b32 s0, s9
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s4, s9
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 48
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s0, s6
-; GFX1064_DPP-NEXT:    s_mov_b32 s1, s7
-; GFX1064_DPP-NEXT:    buffer_atomic_add v0, off, s[0:3], 0 glc
+; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
 ; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl1_inv
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB2_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    s_mov_b32 s6, s2
-; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v0, s0, v0
-; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: add_i32_varying:
@@ -1309,44 +1309,44 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s5, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT:    s_mov_b32 s4, s6
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s0, s6
-; GFX1032_DPP-NEXT:    s_mov_b32 s1, s7
-; GFX1032_DPP-NEXT:    buffer_atomic_add v0, off, s[0:3], 0 glc
+; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
 ; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl1_inv
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB2_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    s_mov_b32 s6, s2
-; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v0, s0, v0
-; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: add_i32_varying:
@@ -1712,109 +1712,110 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: add_i64_constant:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
-; GFX9-NEXT:    s_mul_i32 s2, s2, 5
-; GFX9-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_mul_i32 s2, s2, 5
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
+; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB3_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_nop 2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
+; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s8, s6
-; GFX1064-NEXT:    s_mov_b32 s9, s7
+; GFX1064-NEXT:    s_mov_b32 s8, s2
+; GFX1064-NEXT:    s_mov_b32 s9, s3
 ; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB3_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1]
-; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i64_constant:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
+; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s8, s6
-; GFX1032-NEXT:    s_mov_b32 s9, s7
+; GFX1032-NEXT:    s_mov_b32 s8, s2
+; GFX1032-NEXT:    s_mov_b32 s9, s3
 ; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB3_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1]
-; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i64_constant:
@@ -2479,160 +2480,160 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_ITERATIVE-LABEL: add_i64_varying:
 ; GFX9_ITERATIVE:       ; %bb.0: ; %entry
-; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX9_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s6
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
-; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT:    s_add_u32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, m0
+; GFX9_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
-; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX9_ITERATIVE-NEXT:  ; %bb.3:
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s7
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
 ; GFX9_ITERATIVE-NEXT:  .LBB5_4:
-; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v4
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9_ITERATIVE-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v1
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX9_ITERATIVE-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
-; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1064_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s6
-; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
-; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s6
+; GFX1064_ITERATIVE-NEXT:    s_add_u32 s4, s4, s7
+; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s8
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
 ; GFX1064_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1064_ITERATIVE-NEXT:  .LBB5_4:
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v3
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX1064_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s0, v1
-; GFX1064_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s1, v2, vcc
-; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v1
+; GFX1064_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1032_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1032_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s5, s4
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s5
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s5
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s5
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s5
-; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
-; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s5, 1, s5
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s4, s4, s5
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s1
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s1
+; GFX1032_ITERATIVE-NEXT:    s_add_u32 s4, s4, s6
+; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s6, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
 ; GFX1032_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1032_ITERATIVE-NEXT:  .LBB5_4:
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v3
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX1032_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v1
-; GFX1032_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v2, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1164_ITERATIVE-LABEL: add_i64_varying:
@@ -2986,22 +2987,22 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_DPP-LABEL: add_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9_DPP-NEXT:    s_nop 0
@@ -3049,39 +3050,39 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    s_nop 0
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s3, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s2, v3, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s6, v3, 63
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s7
 ; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s8, s6
-; GFX9_DPP-NEXT:    s_mov_b32 s9, s7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s6
 ; GFX9_DPP-NEXT:    buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc
 ; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
 ; GFX9_DPP-NEXT:  .LBB5_2:
-; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v8
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v7
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9_DPP-NEXT:    v_add_co_u32_e32 v7, vcc, s1, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v7, vcc, s5, v7
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v8, vcc, v0, v8, vcc
-; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: add_i64_varying:
@@ -3144,59 +3145,59 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v4, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v3, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v4, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v3, 15
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v4, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v3, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s10, v3, 47
-; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s2, 16
-; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s3, 16
-; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v3, 63
+; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 16
+; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 16
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v3, 63
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s11, v4, 47
-; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v4, 63
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s8, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s9, 32
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX1064_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s11, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s10, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s0, s6
-; GFX1064_DPP-NEXT:    s_mov_b32 s1, s7
-; GFX1064_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[0:3], 0 glc
+; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
 ; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl1_inv
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB5_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v9
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
-; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s0, v11
-; GFX1064_DPP-NEXT:    s_mov_b32 s6, s2
-; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc
-; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s2, v11
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s3, v12, vcc
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: add_i64_varying:
@@ -3247,51 +3248,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
-; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v4, 15
-; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v4, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v3, 15
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT:    v_readlane_b32 s7, v3, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s8, 16
-; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s3, 16
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s7, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s0, s6
-; GFX1032_DPP-NEXT:    s_mov_b32 s1, s7
-; GFX1032_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[0:3], 0 glc
+; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
 ; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl1_inv
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB5_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v9
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
-; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s0, v11
-; GFX1032_DPP-NEXT:    s_mov_b32 s6, s2
-; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
-; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s2, v11
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: add_i64_varying:
@@ -3807,104 +3808,104 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: sub_i32_constant:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
-; GFX9-NEXT:    s_mul_i32 s2, s2, 5
-; GFX9-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_mul_i32 s2, s2, 5
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB6_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
+; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s8, s6
-; GFX1064-NEXT:    s_mov_b32 s9, s7
+; GFX1064-NEXT:    s_mov_b32 s8, s2
+; GFX1064-NEXT:    s_mov_b32 s9, s3
 ; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB6_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i32_constant:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s1, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
+; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s8, s6
-; GFX1032-NEXT:    s_mov_b32 s9, s7
+; GFX1032-NEXT:    s_mov_b32 s8, s2
+; GFX1032-NEXT:    s_mov_b32 s9, s3
 ; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB6_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i32_constant:
@@ -4497,137 +4498,137 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-LABEL: sub_i32_varying:
 ; GFX9_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, 0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX9_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s4
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s4
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s4
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s8, m0
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s8, s8, s6
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX9_ITERATIVE-NEXT:  ; %bb.3:
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s14, -1
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s12, s6
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s13, s7
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[12:15], 0 glc
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
 ; GFX9_ITERATIVE-NEXT:  .LBB8_4:
-; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX9_ITERATIVE-NEXT:    v_sub_u32_e32 v0, s0, v1
-; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT:    v_sub_u32_e32 v0, s4, v1
+; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1064_ITERATIVE-LABEL: sub_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s8, s6
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s7, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s7
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s7
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX1064_ITERATIVE-NEXT:    s_add_i32 s8, s8, s7
+; GFX1064_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
 ; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
 ; GFX1064_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1064_ITERATIVE-NEXT:  .LBB8_4:
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX1064_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s0, v1
-; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1032_ITERATIVE-LABEL: sub_i32_varying:
 ; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s4, 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1032_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s4, s1
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s4
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s4
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s4
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
-; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s5
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s5, v0, s1
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s1
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s6
+; GFX1032_ITERATIVE-NEXT:    s_add_i32 s4, s4, s5
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
-; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s1, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s5, exec_lo, s5
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
 ; GFX1032_ITERATIVE-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1032_ITERATIVE-NEXT:  .LBB8_4:
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX1032_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s0, v1
-; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1164_ITERATIVE-LABEL: sub_i32_varying:
@@ -4911,17 +4912,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_DPP-LABEL: sub_i32_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4933,33 +4934,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX9_DPP-NEXT:    s_nop 1
 ; GFX9_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s6, v2, 63
 ; GFX9_DPP-NEXT:    s_nop 0
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr0
-; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
 ; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s8, s6
-; GFX9_DPP-NEXT:    s_mov_b32 s9, s7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9_DPP-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
 ; GFX9_DPP-NEXT:  .LBB8_2:
-; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
-; GFX9_DPP-NEXT:    v_sub_u32_e32 v0, s0, v0
-; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX9_DPP-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: sub_i32_varying:
@@ -4980,50 +4981,50 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v1, 31
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v1, 15
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v1, 31
-; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 16
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v1, 31
+; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s6, 16
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v1, 47
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v1, 63
-; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s2, 32
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s7, 32
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064_DPP-NEXT:    s_mov_b32 s0, s9
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s4, s9
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v3, s8, 48
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s0, s6
-; GFX1064_DPP-NEXT:    s_mov_b32 s1, s7
-; GFX1064_DPP-NEXT:    buffer_atomic_sub v0, off, s[0:3], 0 glc
+; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
 ; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl1_inv
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB8_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    s_mov_b32 s6, s2
-; GFX1064_DPP-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: sub_i32_varying:
@@ -5040,44 +5041,44 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1032_DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1032_DPP-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT:    v_readlane_b32 s2, v1, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s6, v1, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v1, 15
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s5, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT:    s_mov_b32 s4, s6
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s0, s6
-; GFX1032_DPP-NEXT:    s_mov_b32 s1, s7
-; GFX1032_DPP-NEXT:    buffer_atomic_sub v0, off, s[0:3], 0 glc
+; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
 ; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl1_inv
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB8_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    s_mov_b32 s6, s2
-; GFX1032_DPP-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: sub_i32_varying:
@@ -5444,117 +5445,117 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: sub_i64_constant:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
-; GFX9-NEXT:    s_mul_i32 s2, s2, 5
-; GFX9-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_mul_i32 s2, s2, 5
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
+; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB9_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
+; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s8, s6
-; GFX1064-NEXT:    s_mov_b32 s9, s7
+; GFX1064-NEXT:    s_mov_b32 s8, s2
+; GFX1064-NEXT:    s_mov_b32 s9, s3
 ; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB9_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v0
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
-; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i64_constant:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_mul_i32 s1, s1, 5
+; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s8, s6
-; GFX1032-NEXT:    s_mov_b32 s9, s7
+; GFX1032-NEXT:    s_mov_b32 s8, s2
+; GFX1032-NEXT:    s_mov_b32 s9, s3
 ; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB9_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i64_constant:
@@ -6252,160 +6253,160 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX9_ITERATIVE:       ; %bb.0: ; %entry
-; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX9_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s6
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
-; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
-; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT:    s_add_u32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, m0
+; GFX9_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
-; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX9_ITERATIVE-NEXT:  ; %bb.3:
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s7
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
 ; GFX9_ITERATIVE-NEXT:  .LBB11_4:
-; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v4
-; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9_ITERATIVE-NEXT:    v_sub_co_u32_e32 v0, vcc, s1, v1
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v1
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 s6, -1
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX9_ITERATIVE-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v2, vcc
-; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1064_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s6
-; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
-; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s6
+; GFX1064_ITERATIVE-NEXT:    s_add_u32 s4, s4, s7
+; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s8
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
-; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
 ; GFX1064_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1064_ITERATIVE-NEXT:  .LBB11_4:
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v3
-; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX1064_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s0, v1
-; GFX1064_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v2, vcc
-; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1032_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s4, exec_lo
-; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1032_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s5, s4
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s5
-; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s5
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s5
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s5
-; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
-; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s5, 1, s5
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s4, s4, s5
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s1
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s1
+; GFX1032_ITERATIVE-NEXT:    s_add_u32 s4, s4, s6
+; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s6, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s1
-; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s6
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s7
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
 ; GFX1032_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
 ; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
 ; GFX1032_ITERATIVE-NEXT:  .LBB11_4:
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s0, v3
-; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s6, -1
-; GFX1032_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v1
-; GFX1032_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v2, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032_ITERATIVE-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
@@ -6759,22 +6760,22 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9_DPP-LABEL: sub_i64_varying:
 ; GFX9_DPP:       ; %bb.0: ; %entry
-; GFX9_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9_DPP-NEXT:    s_not_b64 exec, exec
-; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9_DPP-NEXT:    s_nop 0
@@ -6822,39 +6823,39 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    s_nop 0
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT:    v_readlane_b32 s3, v4, 63
-; GFX9_DPP-NEXT:    v_readlane_b32 s2, v3, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s6, v3, 63
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9_DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX9_DPP-NEXT:  ; %bb.1:
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s7
 ; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s8, s6
-; GFX9_DPP-NEXT:    s_mov_b32 s9, s7
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s6
 ; GFX9_DPP-NEXT:    buffer_atomic_sub_x2 v[7:8], off, s[8:11], 0 glc
 ; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
 ; GFX9_DPP-NEXT:  .LBB11_2:
-; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s0, v8
-; GFX9_DPP-NEXT:    v_readfirstlane_b32 s1, v7
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9_DPP-NEXT:    v_sub_co_u32_e32 v7, vcc, s1, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT:    v_sub_co_u32_e32 v7, vcc, s5, v7
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX9_DPP-NEXT:    v_subb_co_u32_e32 v8, vcc, v0, v8, vcc
-; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
 ; GFX1064_DPP-LABEL: sub_i64_varying:
@@ -6917,59 +6918,59 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1064_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v4, 15
-; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v3, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v4, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v3, 15
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v4, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v3, 31
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s10, v3, 47
-; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s2, 16
-; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s3, 16
-; GFX1064_DPP-NEXT:    v_readlane_b32 s2, v3, 63
+; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 16
+; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 16
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v3, 63
 ; GFX1064_DPP-NEXT:    v_readlane_b32 s11, v4, 47
-; GFX1064_DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v4, 63
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s8, 32
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s9, 32
-; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
-; GFX1064_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX1064_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s11, 48
 ; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s10, 48
 ; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1064_DPP-NEXT:  ; %bb.1:
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s0, s6
-; GFX1064_DPP-NEXT:    s_mov_b32 s1, s7
-; GFX1064_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[0:3], 0 glc
+; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc
 ; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_gl1_inv
 ; GFX1064_DPP-NEXT:    buffer_gl0_inv
 ; GFX1064_DPP-NEXT:  .LBB11_2:
 ; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v9
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s1, v10
-; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT:    v_sub_co_u32 v9, vcc, s0, v11
-; GFX1064_DPP-NEXT:    s_mov_b32 s6, s2
-; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc
-; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT:    v_sub_co_u32 v9, vcc, s2, v11
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc, s3, v12, vcc
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
 ; GFX1032_DPP-LABEL: sub_i64_varying:
@@ -7020,51 +7021,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
-; GFX1032_DPP-NEXT:    v_readlane_b32 s0, v3, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v3, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v4, 15
-; GFX1032_DPP-NEXT:    v_readlane_b32 s1, v4, 31
+; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v4, 31
 ; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT:    v_readlane_b32 s3, v3, 15
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT:    v_readlane_b32 s7, v3, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s8, 16
-; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s3, 16
-; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s7, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1032_DPP-NEXT:  ; %bb.1:
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s0, s6
-; GFX1032_DPP-NEXT:    s_mov_b32 s1, s7
-; GFX1032_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[0:3], 0 glc
+; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT:    buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc
 ; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_gl1_inv
 ; GFX1032_DPP-NEXT:    buffer_gl0_inv
 ; GFX1032_DPP-NEXT:  .LBB11_2:
 ; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v9
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v2
-; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s1, v10
-; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s0, v11
-; GFX1032_DPP-NEXT:    s_mov_b32 s6, s2
-; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
-; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[4:7], 0
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s2, v11
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
 ; GFX1164_DPP-LABEL: sub_i64_varying:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index eb05613da0453..1439d4b40c951 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1839,112 +1839,111 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
 ;
 ; GFX9-LABEL: add_i64_uniform:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s3, s7, s2
-; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s2
-; GFX9-NEXT:    s_add_i32 s8, s8, s3
-; GFX9-NEXT:    s_mul_i32 s2, s6, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_mul_i32 s7, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
+; GFX9-NEXT:    s_add_i32 s8, s8, s7
+; GFX9-NEXT:    s_mul_i32 s6, s2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  .LBB5_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_nop 2
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i64_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s3, s7, s2
-; GFX1064-NEXT:    s_mul_hi_u32 s8, s6, s2
-; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1064-NEXT:    s_add_i32 s8, s8, s3
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
+; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
+; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1064-NEXT:    s_add_i32 s8, s8, s7
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB5_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v2, s[0:1]
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
+; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i64_uniform:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mul_i32 s2, s7, s1
-; GFX1032-NEXT:    s_mul_hi_u32 s3, s6, s1
-; GFX1032-NEXT:    s_mul_i32 s1, s6, s1
-; GFX1032-NEXT:    s_add_i32 s3, s3, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
+; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
+; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1032-NEXT:    s_add_i32 s7, s7, s6
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB5_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s6, v2, s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s0, s7, v2, v[1:2]
-; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
+; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i64_uniform:
@@ -5502,119 +5501,119 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ;
 ; GFX9-LABEL: sub_i64_uniform:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s3, s7, s2
-; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s2
-; GFX9-NEXT:    s_add_i32 s8, s8, s3
-; GFX9-NEXT:    s_mul_i32 s2, s6, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_mul_i32 s7, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
+; GFX9-NEXT:    s_add_i32 s8, s8, s7
+; GFX9-NEXT:    s_mul_i32 s6, s2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  .LBB13_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], s7, v2, v[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v3
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s1, v3
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i64_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s3, s7, s2
-; GFX1064-NEXT:    s_mul_hi_u32 s8, s6, s2
-; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1064-NEXT:    s_add_i32 s8, s8, s3
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064-NEXT:    s_mul_i32 s7, s3, s6
+; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
+; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
+; GFX1064-NEXT:    s_add_i32 s8, s8, s7
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB13_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s7, v2, v[4:5]
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v3
+; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
+; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
-; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
+; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i64_uniform:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s1, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s1
+; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mul_i32 s2, s7, s1
-; GFX1032-NEXT:    s_mul_hi_u32 s3, s6, s1
-; GFX1032-NEXT:    s_mul_i32 s1, s6, s1
-; GFX1032-NEXT:    s_add_i32 s3, s3, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    s_mul_i32 s6, s3, s5
+; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
+; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
+; GFX1032-NEXT:    s_add_i32 s7, s7, s6
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB13_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s0, s6, v2, 0
-; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s0, s7, v2, v[4:5]
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v3
+; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
+; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
+; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i64_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 7ae4766584e00..f636fa5d83a57 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -426,14 +426,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    s_mov_b32 null, 0
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_uniform:
@@ -1505,14 +1504,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB5_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 62e50b2597c17..3e8565d34c6be 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -439,14 +439,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    s_mov_b32 null, 0
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_uniform:
@@ -1675,14 +1674,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10W32-NEXT:  .LBB6_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10W32-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 04d72691a088a..0bd030f1a3750 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
   ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
-  ; GFX90A-NEXT:   early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 13c4ff8b2ff30..8293280609517 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -316,14 +316,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ;
 ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
 ; GFX940:       ; %bb.0: ; %entry
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    s_lshl_b32 s0, s7, 16
-; GFX940-NEXT:    s_lshl_b32 s1, s6, 16
-; GFX940-NEXT:    v_mov_b32_e32 v0, s1
-; GFX940-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1
+; GFX940-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX940-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 entry:
   %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 0cc10512af5cd..f248708d16ea2 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -157,26 +157,26 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX9-LABEL: sadd64ri:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s6, 0x56789876
-; GFX9-NEXT:    s_addc_u32 s1, s7, 0x1234
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_add_u32 s2, s2, 0x56789876
+; GFX9-NEXT:    s_addc_u32 s3, s3, 0x1234
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: sadd64ri:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    s_add_u32 s0, s6, 0x56789876
-; GFX1010-NEXT:    s_addc_u32 s1, s7, 0x1234
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT:    s_add_u32 s2, s2, 0x56789876
+; GFX1010-NEXT:    s_addc_u32 s3, s3, 0x1234
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: sadd64ri:
@@ -255,23 +255,23 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX9-LABEL: vadd64rr:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: vadd64rr:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    v_add_co_u32 v0, s0, s6, v0
-; GFX1010-NEXT:    v_add_co_ci_u32_e64 v1, s0, s7, 0, s0
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT:    v_add_co_u32 v0, s2, s2, v0
+; GFX1010-NEXT:    v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: vadd64rr:
@@ -679,34 +679,34 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX9-LABEL: suaddo64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s8, s10
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_add_u32 s6, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_addc_u32 s7, s5, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: suaddo64:
 ; GFX1010:       ; %bb.0:
-; GFX1010-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    s_add_u32 s0, s8, s10
-; GFX1010-NEXT:    s_addc_u32 s1, s9, s11
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1010-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
-; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX1010-NEXT:    global_store_byte v2, v3, s[6:7]
+; GFX1010-NEXT:    s_add_u32 s6, s4, s6
+; GFX1010-NEXT:    s_addc_u32 s7, s5, s7
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1010-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: suaddo64:
@@ -1048,26 +1048,26 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX9-LABEL: ssub64ri:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s0, 0x56789876, s6
-; GFX9-NEXT:    s_subb_u32 s1, 0x1234, s7
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_sub_u32 s2, 0x56789876, s2
+; GFX9-NEXT:    s_subb_u32 s3, 0x1234, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: ssub64ri:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    s_sub_u32 s0, 0x56789876, s6
-; GFX1010-NEXT:    s_subb_u32 s1, 0x1234, s7
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT:    s_sub_u32 s2, 0x56789876, s2
+; GFX1010-NEXT:    s_subb_u32 s3, 0x1234, s3
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: ssub64ri:
@@ -1146,23 +1146,23 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX9-LABEL: vsub64rr:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: vsub64rr:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    v_sub_co_u32 v0, s0, s6, v0
-; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT:    v_sub_co_u32 v0, s2, s2, v0
+; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: vsub64rr:
@@ -1571,34 +1571,34 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX9-LABEL: susubo64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s0, s8, s10
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_subb_u32 s1, s9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_sub_u32 s6, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_subb_u32 s7, s5, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: susubo64:
 ; GFX1010:       ; %bb.0:
-; GFX1010-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX1010-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    s_sub_u32 s0, s8, s10
-; GFX1010-NEXT:    s_subb_u32 s1, s9, s11
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1010-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
-; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX1010-NEXT:    global_store_byte v2, v3, s[6:7]
+; GFX1010-NEXT:    s_sub_u32 s6, s4, s6
+; GFX1010-NEXT:    s_subb_u32 s7, s5, s7
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1010-NEXT:    v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX1010-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: susubo64:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 45324392aacde..e1717a816de0d 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -40,13 +40,13 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_add_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f32:
@@ -117,14 +117,14 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_multi_use_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_max_f32_e64 v2, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -194,13 +194,13 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_clamp_dbg_use_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_dbg_use_src_f32:
@@ -267,14 +267,14 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_clamp_add_neg_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_floor_f32_e32 v1, v1
 ; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_neg_src_f32:
@@ -342,14 +342,14 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_non_clamp_max_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_non_clamp_max_f32:
@@ -413,13 +413,13 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_clamp_add_src_f32_denormals:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f32_denormals:
@@ -485,13 +485,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_add_src_f16_denorm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f16_denorm:
@@ -557,13 +557,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_clamp_add_src_f16_no_denormals:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
@@ -629,14 +629,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f32:
@@ -701,13 +701,13 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_add_src_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_f64:
@@ -866,13 +866,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm:
@@ -947,13 +947,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals:
@@ -1038,14 +1038,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg:
@@ -1124,14 +1124,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
@@ -1212,14 +1212,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
@@ -1298,14 +1298,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
@@ -1382,14 +1382,14 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src:
@@ -1469,14 +1469,14 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_no_clamp_add_packed_src_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_no_clamp_add_packed_src_f32:
@@ -1553,15 +1553,15 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v1, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 94482d7f0d809..9b6c50c10d90d 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -41,13 +41,13 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32:
@@ -126,13 +126,13 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: v_clamp_neg_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neg_f32:
@@ -212,13 +212,13 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX9-LABEL: v_clamp_negabs_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negabs_f32:
@@ -304,15 +304,15 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_negzero_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negzero_f32:
@@ -400,15 +400,15 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
@@ -498,15 +498,15 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_multi_use_max_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
 ; GFX9-NEXT:    v_min_f32_e32 v2, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -600,13 +600,13 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f16:
@@ -686,13 +686,13 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: v_clamp_neg_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neg_f16:
@@ -773,13 +773,13 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX9-LABEL: v_clamp_negabs_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negabs_f16:
@@ -861,13 +861,13 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f64:
@@ -946,13 +946,13 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: v_clamp_neg_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neg_f64:
@@ -1032,13 +1032,13 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX9-LABEL: v_clamp_negabs_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negabs_f64:
@@ -1122,14 +1122,14 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_brev_b32 s0, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    s_brev_b32 s2, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_med3_f32 v1, s0, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_med3_f32 v1, s2, 1.0, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
@@ -1206,13 +1206,13 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_aby_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_aby_f32:
@@ -1289,13 +1289,13 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_bay_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_bay_f32:
@@ -1372,13 +1372,13 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_yab_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_yab_f32:
@@ -1455,13 +1455,13 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_yba_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_yba_f32:
@@ -1538,13 +1538,13 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_ayb_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_ayb_f32:
@@ -1621,13 +1621,13 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_med3_bya_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_bya_f32:
@@ -2091,14 +2091,14 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32_no_dx10_clamp:
@@ -2179,13 +2179,13 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_clamp_f32_snan_dx10clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32_snan_dx10clamp:
@@ -2267,14 +2267,14 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp:
@@ -2356,14 +2356,14 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
 ;
 ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
@@ -2444,13 +2444,13 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
@@ -2527,13 +2527,13 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
@@ -2610,13 +2610,13 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
@@ -2693,13 +2693,13 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 1.0, 0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
@@ -2776,13 +2776,13 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, 0, v1, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
@@ -2859,13 +2859,13 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, 1.0, v1, 0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
@@ -3078,13 +3078,13 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_clamp_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16:
@@ -3180,13 +3180,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_clamp_v2f16_undef_elt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_undef_elt:
@@ -3277,15 +3277,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_clamp_v2f16_not_zero:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, 2.0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_not_zero:
@@ -3381,15 +3381,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: v_clamp_v2f16_not_one:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, 0
 ; GFX9-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_not_one:
@@ -3483,13 +3483,13 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
 ;
 ; GFX9-LABEL: v_clamp_neg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neg_v2f16:
@@ -3578,14 +3578,14 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: v_clamp_negabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_negabs_v2f16:
@@ -3678,13 +3678,13 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_neglo_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neglo_v2f16:
@@ -3774,13 +3774,13 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_clamp_neghi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_neghi_v2f16:
@@ -3870,13 +3870,13 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: v_clamp_v2f16_shuffle:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_shuffle:
@@ -3973,13 +3973,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
@@ -4075,13 +4075,13 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
@@ -4163,18 +4163,18 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_clamp_diff_source_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s6, s[2:3], 0x8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_add_f32_e32 v1, s0, v1
-; GFX9-NEXT:    v_add_f32_e32 v2, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_add_f32_e32 v1, s4, v1
+; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_max_f32_e64 v1, v1, v2 clamp
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5] offset:12
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:12
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_clamp_diff_source_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index bada3d904fbe3..fad1d47f55fd7 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ;
 ; GFX10-LABEL: cluster_load_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s0, s4, 8
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10-NEXT:    s_add_u32 s2, s4, 16
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    s_addc_u32 s3, s5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s4, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    v_mov_b32_e32 v4, s2
+; GFX10-NEXT:    s_add_u32 s4, s0, 8
+; GFX10-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-NEXT:    s_add_u32 s6, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_addc_u32 s7, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    s_add_u32 s0, s0, 24
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    v_mov_b32_e32 v4, s6
+; GFX10-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_clause 0x3
@@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
 ; GFX10-NEXT:    flat_load_dword v9, v[2:3]
 ; GFX10-NEXT:    flat_load_dword v10, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v11, v[6:7]
-; GFX10-NEXT:    s_add_u32 s0, s6, 8
-; GFX10-NEXT:    s_addc_u32 s1, s7, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    s_add_u32 s0, s2, 8
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s6, 16
-; GFX10-NEXT:    s_addc_u32 s1, s7, 0
-; GFX10-NEXT:    s_add_u32 s2, s6, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    s_addc_u32 s3, s7, 0
+; GFX10-NEXT:    s_add_u32 s0, s2, 16
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_add_u32 s2, s2, 24
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
@@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ;
 ; GFX10-LABEL: cluster_load_valu_cluster_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s0, s4, 8
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10-NEXT:    s_add_u32 s2, s4, 16
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_addc_u32 s3, s5, 0
-; GFX10-NEXT:    s_add_u32 s0, s4, 24
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    s_addc_u32 s1, s5, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    v_mov_b32_e32 v4, s2
+; GFX10-NEXT:    s_add_u32 s4, s0, 8
+; GFX10-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    s_add_u32 s6, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_addc_u32 s7, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    s_add_u32 s0, s0, 24
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s6
+; GFX10-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX10-NEXT:    flat_load_dword v6, v[2:3]
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
@@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
 ; GFX10-NEXT:    flat_load_dword v8, v[0:1]
 ; GFX10-NEXT:    flat_load_dword v9, v[4:5]
 ; GFX10-NEXT:    flat_load_dword v10, v[2:3]
-; GFX10-NEXT:    s_add_u32 s0, s6, 8
-; GFX10-NEXT:    s_addc_u32 s1, s7, 0
-; GFX10-NEXT:    s_add_u32 s2, s6, 16
+; GFX10-NEXT:    s_add_u32 s0, s2, 8
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    s_add_u32 s4, s2, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    s_addc_u32 s3, s7, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NEXT:    s_addc_u32 s5, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    s_add_u32 s0, s6, 24
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    v_mov_b32_e32 v5, s3
-; GFX10-NEXT:    s_addc_u32 s1, s7, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s2
+; GFX10-NEXT:    s_add_u32 s0, s2, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s1
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index b5e0589cf9e46..df223b3ec1354 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_zext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
+; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar
 ;
 ; GFX9-LABEL: sub_sext_setcc_commute:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v2, s[4:5]
+; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index 1e2a2dcf1e333..48bd8f9b80799 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -4,13 +4,13 @@
 define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) {
 ; GCN-LABEL: vectorLoadCombine:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_load_dword v2, v[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -37,14 +37,14 @@ entry:
 define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) {
 ; GCN-LABEL: vectorLoadShuffle:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN-NEXT:    s_mov_b32 s0, 0x7050604
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_load_dword v2, v[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b32 s0, 0x7050604
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_perm_b32 v2, v2, v2, s0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
index ce181b6223e41..c57ee9cc6a1e2 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
@@ -8,21 +8,21 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:252
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s2, 0
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
-; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GCN-NEXT:    s_cselect_b32 s0, 2, 3
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
+; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], vcc
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s2, 2, 3
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:                                             ; preds = %1009
   %0 = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index ba0a1e75e29b7..63b9d68123fa4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -164,28 +164,28 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32:
@@ -278,32 +278,32 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v2i32:
@@ -414,11 +414,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_ctlz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -428,16 +428,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -447,7 +447,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_v4i32:
@@ -555,28 +555,28 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_ctlz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8:
@@ -742,24 +742,24 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; GFX10-NEXT:    s_min_u32 s0, s0, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_flbit_i32_b64 s2, s[2:3]
+; GFX10-NEXT:    s_min_u32 s2, s2, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
+; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_ctlz_i64_trunc:
@@ -852,25 +852,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
@@ -878,7 +878,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64:
@@ -981,33 +981,33 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_ctlz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i64_trunc:
@@ -1099,29 +1099,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
@@ -1208,29 +1208,29 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
@@ -1326,32 +1326,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
@@ -1450,32 +1450,32 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
@@ -1569,22 +1569,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1595,7 +1595,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
@@ -1691,25 +1691,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -1717,7 +1717,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
@@ -1812,23 +1812,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1841,7 +1841,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index a55c8cdc9b6e8..f16f05811c185 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -561,14 +561,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s4, s[2:3]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
   %ctlz_ret = icmp ne i64 %val, 0
@@ -649,17 +649,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
@@ -753,11 +753,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
@@ -765,7 +765,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
@@ -869,13 +869,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -886,7 +886,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -1050,17 +1050,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1081,7 +1081,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v4, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -1158,11 +1158,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1282,12 +1282,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s0, s[6:7]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   %trunc = trunc i64 %ctlz to i32
@@ -1364,17 +1364,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1454,17 +1454,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1534,16 +1534,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1613,16 +1613,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1697,11 +1697,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1709,9 +1709,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[0:1]
-; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[2:3]
+; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1799,17 +1799,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1888,16 +1888,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1972,16 +1972,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2057,16 +2057,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -2142,16 +2142,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ;
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 57fe6cd4e1e45..02b0b1cc28fa8 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v2i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ;
 ; GFX10-LABEL: v_cttz_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_v4i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
@@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ;
 ; GFX10-LABEL: v_cttz_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %valptr
   %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ;
 ; GFX10-LABEL: s_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ff1_i32_b64 s0, s[6:7]
-; GFX10-NEXT:    s_min_u32 s0, s0, 64
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_ff1_i32_b64 s2, s[2:3]
+; GFX10-NEXT:    s_min_u32 s2, s2, 64
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
-; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s2, s[2:3]
+; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %cttz to i32
@@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ;
 ; GFX10-LABEL: v_cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
@@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ;
 ; GFX10-LABEL: v_cttz_i64_trunc:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s0
-; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[4:5]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s2
+; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
   %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
@@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
@@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 81ed823bad204..2491abe4bc1ce 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s0, s[6:7]
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[2:3]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
   %cttz_ret = icmp ne i64 %val, 0
@@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
@@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
@@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
@@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ;
 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:2
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:4
-; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:5
-; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:6
-; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:7
+; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
+; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
@@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
-; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
@@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
@@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ;
 ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
-; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:1
+; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
@@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
   %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index f3f749b5c054b..96969a12b2c58 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -933,24 +933,24 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; GFX10-LABEL: load_i8_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_i8_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_i8_to_f32:
@@ -1013,28 +1013,28 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v2i8_to_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v2i8_to_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v2i8_to_v2f32:
@@ -1104,30 +1104,30 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v3i8_to_v3f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v3i8_to_v3f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v3i8_to_v3f32:
@@ -1198,32 +1198,32 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32:
@@ -1318,15 +1318,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -1335,19 +1335,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -1356,7 +1356,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned:
@@ -1481,15 +1481,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[8:9] offset:2
-; GFX10-NEXT:    global_load_ubyte v3, v0, s[8:9] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[10:11] offset:3
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[10:11] offset:2
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:2
+; GFX10-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:3
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v3, 8, v1
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
@@ -1499,21 +1499,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    v_perm_b32 v4, v5, v6, 0x4000405
-; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dword v7, v4, s[6:7]
+; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dword v7, v4, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    s_mov_b32 s0, 0x4000405
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[8:9] offset:2
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[10:11] offset:3
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[8:9] offset:3
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[10:11] offset:2
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:2
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:3
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[4:5] offset:3
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:2
+; GFX9-NEXT:    s_mov_b32 s4, 0x4000405
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshl_or_b32 v6, v3, 8, v1
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
@@ -1522,9 +1522,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    v_perm_b32 v4, v6, v7, s0
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
-; GFX9-NEXT:    global_store_dword v5, v4, s[6:7]
+; GFX9-NEXT:    v_perm_b32 v4, v6, v7, s4
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dword v5, v4, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1653,12 +1653,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v0
@@ -1676,21 +1676,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dword v4, v5, s[6:7]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT:    global_store_dword v4, v5, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0xff00
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff00
-; GFX9-NEXT:    s_movk_i32 s1, 0x900
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_movk_i32 s5, 0x900
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
@@ -1698,16 +1699,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
 ; GFX9-NEXT:    v_add_u16_e32 v8, 9, v4
-; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u16_e32 v0, 0x900, v0
-; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v5, v0, s[6:7]
+; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
@@ -1849,17 +1851,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v7i8_to_v7f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x5
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7] offset:1
-; GFX10-NEXT:    global_load_short_d16 v7, v0, s[6:7] offset:4
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
+; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(4)
@@ -1873,22 +1875,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[4:5] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v7i8_to_v7f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v7, v0, s[6:7] offset:2
-; GFX9-NEXT:    global_load_ubyte v8, v0, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v9, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v7, v0, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v8, v0, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v9, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
@@ -1902,8 +1904,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
-; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
-; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[4:5] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v7i8_to_v7f32:
@@ -2002,11 +2004,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: load_v8i8_to_v8f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
@@ -2016,17 +2018,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[4:5] offset:16
-; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: load_v8i8_to_v8f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
@@ -2036,8 +2038,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
-; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[4:5] offset:16
-; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[0:1] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: load_v8i8_to_v8f32:
@@ -2112,28 +2114,28 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_i32_to_f32:
@@ -2199,26 +2201,26 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32:
@@ -2283,24 +2285,24 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX10-LABEL: i8_zext_i32_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: i8_zext_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: i8_zext_i32_to_f32:
@@ -2386,15 +2388,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v0, s[6:7]
+; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
@@ -2403,19 +2405,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7] offset:2
-; GFX9-NEXT:    global_load_ubyte v4, v0, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
+; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
@@ -2424,7 +2426,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32:
@@ -2498,26 +2500,26 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte0_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte0_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte0_to_f32:
@@ -2581,26 +2583,26 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte1_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte1_to_f32:
@@ -2665,26 +2667,26 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte2_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte2_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte2_to_f32:
@@ -2749,26 +2751,26 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX10-LABEL: extract_byte3_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: extract_byte3_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: extract_byte3_to_f32:
@@ -2838,30 +2840,30 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX10-LABEL: cvt_ubyte0_or_multiuse:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[4:5]
+; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX10-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: cvt_ubyte0_or_multiuse:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cvt_ubyte0_or_multiuse:
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index de14d64dbf7e9..6799980c18439 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -8,13 +8,13 @@
 define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: add:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_add v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_add v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -30,13 +30,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: sub:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_sub v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_sub v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -52,13 +52,13 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_and v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_and v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -74,13 +74,13 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: or:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_or v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_or v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -96,13 +96,13 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q
 define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: xor:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_xor v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_xor v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -118,28 +118,28 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: nand:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_not_b32_e32 v0, v3
 ; CHECK-NEXT:    v_or_b32_e32 v2, -2, v0
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
 ; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
@@ -154,13 +154,13 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: max_workgroup:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -176,13 +176,13 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr
 define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: max:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -198,13 +198,13 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: min_workgroup:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -220,13 +220,13 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr
 define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: min:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -242,13 +242,13 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umax_workgroup:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -264,13 +264,13 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add
 define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umax:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -286,13 +286,13 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umin_workgroup:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -308,13 +308,13 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add
 define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umin:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -330,14 +330,14 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: cmpxchg:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_cmpswap v2, v2, v[0:1], s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -354,13 +354,13 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(
 define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: xchg:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_swap v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_swap v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -376,13 +376,13 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: inc:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_inc v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_inc v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -398,13 +398,13 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: dec:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_dec v2, v0, v1, s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    global_atomic_dec v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
@@ -420,28 +420,28 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
 define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fadd:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_add_f32_e32 v2, 1.0, v3
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB18_1
 ; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
@@ -457,28 +457,28 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fsub:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_add_f32_e32 v2, -1.0, v3
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB19_1
 ; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
@@ -494,14 +494,14 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fmin:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_min_f64 v[0:1], v2, v[0:1], s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
@@ -519,14 +519,14 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1)
 define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fmax:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_atomic_max_f64 v[0:1], v2, v[0:1], s[4:5] glc
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index fea1303d0a2b7..f0ab3a5342e01 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -482,21 +482,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, 1, v7
@@ -507,6 +514,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, s14
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -1040,10 +1048,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -2687,21 +2695,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, 1, v7
@@ -2712,6 +2727,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, s14
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -3245,10 +3261,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 5fae60a7acac1..f298a95c63485 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -277,12 +277,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ;
 ; GFX9-LABEL: uniform_vec_i16_LL:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s0
 ; GFX9-NEXT:    ;;#ASMEND
@@ -290,12 +290,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ;
 ; GFX906-LABEL: uniform_vec_i16_LL:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX906-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s0
 ; GFX906-NEXT:    ;;#ASMEND
@@ -376,22 +376,22 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_LH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_LH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_lh_b32_b16 s0, s6, s7
-; GFX906-NEXT:    v_mov_b32_e32 v1, s0
-; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX906-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_LH:
@@ -466,22 +466,22 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: uniform_vec_i16_HH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX906-LABEL: uniform_vec_i16_HH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_hh_b32_b16 s0, s6, s7
-; GFX906-NEXT:    v_mov_b32_e32 v1, s0
-; GFX906-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX906-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX906-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uniform_vec_i16_HH:
@@ -561,12 +561,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ;
 ; GFX9-LABEL: uniform_vec_f16_LL:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use s0
 ; GFX9-NEXT:    ;;#ASMEND
@@ -574,12 +574,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ;
 ; GFX906-LABEL: uniform_vec_f16_LL:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX906-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
 ; GFX906-NEXT:    ;;#ASMSTART
 ; GFX906-NEXT:    ; use s0
 ; GFX906-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 5b39cc2e185b7..ae4d302e04a7c 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -15,16 +15,13 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
   ; GCN-NEXT:   [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[S_LOAD_DWORD_IMM]]
-  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; GCN-NEXT:   [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def dead $scc
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]]
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY3]], implicit-def dead $scc
-  ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 65536, [[S_LOAD_DWORD_IMM]], implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GCN-NEXT:   S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc
   ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GCN-NEXT:   S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY4]], killed [[COPY3]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
   ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
@@ -68,16 +65,15 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY killed [[S_LOAD_DWORDX2_IMM1]]
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY5]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY4]], implicit-def dead $scc
   ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
-  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
   ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GCN-NEXT:   S_CMP_LT_I32 killed [[COPY4]], killed [[S_MOV_B32_2]], implicit-def $scc
-  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY7]], killed [[COPY6]], implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_LT_I32 killed [[COPY3]], killed [[S_MOV_B32_2]], implicit-def $scc
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY6]], killed [[COPY5]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
   ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
@@ -126,14 +122,13 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
   ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
   ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
   ; GCN-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY7]], implicit-def dead $scc
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[S_LOAD_DWORD_IMM]], implicit-def dead $scc
   ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
-  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sreg_64 = COPY $scc
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
   ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
-  ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY8]], implicit-def dead $scc
+  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
+  ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
   ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 9649cdc6001cd..9f191fa69f654 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -105,12 +105,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write_b32 v0, v1
 ; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
@@ -151,12 +151,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1)
 ;
 ; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write_b32 v0, v1
 ; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
@@ -368,11 +368,11 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C
 ;
 ; GFX9-LABEL: simple_write2_two_val_too_far_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    ds_write_b32 v0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -413,11 +413,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr
 ;
 ; GFX9-LABEL: simple_write2_two_val_f32_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -469,11 +469,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa
 ;
 ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
 ;
 ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
 ; GFX9-ALIGNED:       ; %bb.0:
-; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x8
-; GFX9-ALIGNED-NEXT:    s_load_dword s6, s[2:3], 0x0
+; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x8
+; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s6
-; GFX9-ALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-ALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
+; GFX9-ALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
@@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
 ;
 ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
 ; GFX9-UNALIGNED:       ; %bb.0:
-; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x8
-; GFX9-UNALIGNED-NEXT:    s_load_dword s6, s[2:3], 0x0
+; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x8
+; GFX9-UNALIGNED-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s6
-; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-UNALIGNED-NEXT:    v_lshl_add_u32 v0, v0, 4, s4
+; GFX9-UNALIGNED-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s3
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index fe672f1b3b131..bd483f4c07071 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1842,21 +1842,21 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
 ;
 ; GFX9-LABEL: s_copysign_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    s_lshr_b32 s1, s7, 16
-; GFX9-NEXT:    s_lshr_b32 s2, s6, 16
-; GFX9-NEXT:    v_bfi_b32 v1, s0, v1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_bfi_b32 v2, s0, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_copysign_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 7c89efd0a713c..f53d3cf33c9cc 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -197,24 +197,24 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ;
 ; GFX9-LABEL: v_rcp_f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rcp_f16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rcp_f16:
@@ -293,24 +293,24 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_rcp_f16_abs:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e64 v1, |v1|
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rcp_f16_abs:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e64 v1, |v1|
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rcp_f16_abs:
@@ -392,24 +392,24 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
 ;
 ; GFX9-LABEL: reciprocal_f16_rounded:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: reciprocal_f16_rounded:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: reciprocal_f16_rounded:
@@ -475,24 +475,24 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_rcp_f16_afn:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rcp_f16_afn:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rcp_f16_afn:
@@ -571,24 +571,24 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_rcp_f16_neg:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rcp_f16_neg:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rcp_f16_neg:
@@ -670,24 +670,24 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ;
 ; GFX9-LABEL: v_rsq_f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16:
@@ -771,26 +771,26 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_rsq_f16_neg:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16_neg:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16_neg:
@@ -879,28 +879,28 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
 ;
 ; GFX9-LABEL: v_rsq_f16_multi_use:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rsq_f16_e32 v2, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16_multi_use:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_rsq_f16_e32 v2, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v2, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16_multi_use:
@@ -987,26 +987,26 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
 ;
 ; GFX9-LABEL: v_rsq_f16_missing_contract0:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16_missing_contract0:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16_missing_contract0:
@@ -1092,26 +1092,26 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
 ;
 ; GFX9-LABEL: v_rsq_f16_missing_contract1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_rsq_f16_missing_contract1:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_rsq_f16_missing_contract1:
@@ -1197,26 +1197,26 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
 ;
 ; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 93105e57a5918..c6b730e3fd5d6 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX10-LABEL: s_fdiv_f32_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -133,8 +133,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ninf:
@@ -275,21 +275,21 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ;
 ; GFX10-LABEL: s_fdiv_f32_ieee:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX10-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX10-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_ieee:
@@ -370,16 +370,16 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |s7|
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0
-; GFX10-NEXT:    v_mul_f32_e32 v1, s7, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |s3|
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4
+; GFX10-NEXT:    v_mul_f32_e32 v1, s3, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GFX10-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_f32:
@@ -482,18 +482,18 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ;
 ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s7
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s7
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s6
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, s3
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, s3
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, s2
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32:
@@ -559,12 +559,12 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_fast_ieee_f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_fast_ieee_f32:
@@ -623,12 +623,12 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_fast_math:
@@ -687,12 +687,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ;
 ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math:
@@ -829,10 +829,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_daz:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
-; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX10-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
+; GFX10-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    v_fma_f32 v3, -v0, v1, 1.0
@@ -844,8 +844,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_daz:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ;
 ; GFX10-LABEL: s_fdiv_f32_arcp_ninf:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v0, s7
-; GFX10-NEXT:    v_mul_f32_e32 v0, s6, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    v_rcp_f32_e32 v0, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_fdiv_f32_arcp_ninf:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index e0377ddad14c0..c9618d43943ef 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v2, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    v_mov_b32_e32 v3, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v2, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ;
 ; GCN3-LABEL: atomic_cmpxchg_i32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v2, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    v_mov_b32_e32 v3, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v2, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN3-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -5279,15 +5279,15 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5328,15 +5328,15 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5689,15 +5689,15 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_f32_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5738,15 +5738,15 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_f32:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -6099,15 +6099,15 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i8_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -6148,15 +6148,15 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i8:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_ubyte v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -6394,15 +6394,15 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i16_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -6443,15 +6443,15 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_i16:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -7979,15 +7979,15 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_f16_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
   %gep = getelementptr half, ptr %in, i64 8
@@ -8027,15 +8027,15 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_f16:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
   %val = load atomic half, ptr %in seq_cst, align 2
@@ -8078,15 +8078,15 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_bf16_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
   %gep = getelementptr bfloat, ptr %in, i64 8
@@ -8126,15 +8126,15 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
 ;
 ; GCN3-LABEL: atomic_load_bf16:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
   %val = load atomic bfloat, ptr %in seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 9c2faf622623d..4d80e9124f41f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN3-NEXT:    s_mov_b32 s0, s7
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN3-NEXT:    s_mov_b32 s4, s3
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ;
 ; GCN3-LABEL: atomic_max_i32_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN3-NEXT:    s_mov_b32 s0, s7
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN3-NEXT:    s_mov_b32 s4, s3
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
 ; GCN3-NEXT:  .LBB90_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_i32_e32 v2, s6, v3
+; GCN3-NEXT:    v_max_i32_e32 v2, s2, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ;
 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN3-NEXT:    s_mov_b32 s0, s7
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN3-NEXT:    s_mov_b32 s4, s3
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
 ; GCN3-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_max_u32_e32 v2, s6, v3
+; GCN3-NEXT:    v_max_u32_e32 v2, s2, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
@@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ;
 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_ashr_i32 s1, s7, 31
-; GCN3-NEXT:    s_mov_b32 s0, s7
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN3-NEXT:    s_mov_b32 s4, s3
+; GCN3-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GCN3-NEXT:    s_add_u32 s0, s0, s4
+; GCN3-NEXT:    s_addc_u32 s1, s1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
 ; GCN3-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_min_i32_e32 v2, s6, v3
+; GCN3-NEXT:    v_min_i32_e32 v2, s2, v3
 ; GCN3-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 3fd624b592cd4..7e4a36b7dc11b 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -4439,23 +4439,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s6
+; GCN3-NEXT:    s_addc_u32 s1, s1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s9
-; GCN3-NEXT:    v_mov_b32_e32 v5, s8
+; GCN3-NEXT:    v_mov_b32_e32 v4, s5
+; GCN3-NEXT:    v_mov_b32_e32 v5, s4
 ; GCN3-NEXT:  .LBB89_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
+; GCN3-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -4467,8 +4467,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    s_cbranch_execnz .LBB89_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -4654,23 +4654,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ;
 ; GCN3-LABEL: atomic_max_i64_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s6
+; GCN3-NEXT:    s_addc_u32 s1, s1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s9
-; GCN3-NEXT:    v_mov_b32_e32 v5, s8
+; GCN3-NEXT:    v_mov_b32_e32 v4, s5
+; GCN3-NEXT:    v_mov_b32_e32 v5, s4
 ; GCN3-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
+; GCN3-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4682,8 +4682,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN3-NEXT:    s_cbranch_execnz .LBB91_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5821,23 +5821,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ;
 ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s6
+; GCN3-NEXT:    s_addc_u32 s1, s1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s9
-; GCN3-NEXT:    v_mov_b32_e32 v5, s8
+; GCN3-NEXT:    v_mov_b32_e32 v4, s5
+; GCN3-NEXT:    v_mov_b32_e32 v5, s4
 ; GCN3-NEXT:  .LBB103_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
+; GCN3-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -5849,8 +5849,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN3-NEXT:    s_cbranch_execnz .LBB103_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -5934,23 +5934,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ;
 ; GCN3-LABEL: atomic_umax_i64_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s6
+; GCN3-NEXT:    s_addc_u32 s1, s1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s9
-; GCN3-NEXT:    v_mov_b32_e32 v5, s8
+; GCN3-NEXT:    v_mov_b32_e32 v4, s5
+; GCN3-NEXT:    v_mov_b32_e32 v5, s4
 ; GCN3-NEXT:  .LBB104_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
+; GCN3-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5962,8 +5962,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN3-NEXT:    s_cbranch_execnz .LBB104_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -8045,23 +8045,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ;
 ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s6
+; GCN3-NEXT:    s_addc_u32 s1, s1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s9
-; GCN3-NEXT:    v_mov_b32_e32 v5, s8
+; GCN3-NEXT:    v_mov_b32_e32 v4, s5
+; GCN3-NEXT:    v_mov_b32_e32 v5, s4
 ; GCN3-NEXT:  .LBB126_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
+; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -8073,8 +8073,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    s_cbranch_execnz .LBB126_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
@@ -8148,20 +8148,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: atomic_min_i64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN3-NEXT:    s_mov_b64 s[0:1], 0
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v4, s4
-; GCN3-NEXT:    v_mov_b32_e32 v6, s7
-; GCN3-NEXT:    v_mov_b32_e32 v7, s6
-; GCN3-NEXT:    v_mov_b32_e32 v5, s5
+; GCN3-NEXT:    v_mov_b32_e32 v5, s1
+; GCN3-NEXT:    v_mov_b32_e32 v6, s3
+; GCN3-NEXT:    v_mov_b32_e32 v7, s2
+; GCN3-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN3-NEXT:  .LBB127_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
+; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v1, v6, v3, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8169,9 +8169,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
-; GCN3-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB127_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
@@ -8253,23 +8253,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ;
 ; GCN3-LABEL: atomic_min_i64_ret_addr64:
 ; GCN3:       ; %bb.0: ; %entry
-; GCN3-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN3-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN3-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GCN3-NEXT:    s_add_u32 s0, s4, s0
-; GCN3-NEXT:    s_addc_u32 s1, s5, s1
+; GCN3-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GCN3-NEXT:    s_add_u32 s0, s0, s6
+; GCN3-NEXT:    s_addc_u32 s1, s1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[0:1], 0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s9
-; GCN3-NEXT:    v_mov_b32_e32 v5, s8
+; GCN3-NEXT:    v_mov_b32_e32 v4, s5
+; GCN3-NEXT:    v_mov_b32_e32 v5, s4
 ; GCN3-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v9, v3
 ; GCN3-NEXT:    v_mov_b32_e32 v8, v2
-; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
+; GCN3-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
 ; GCN3-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
 ; GCN3-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
 ; GCN3-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8281,8 +8281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN3-NEXT:    s_cbranch_execnz .LBB128_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v1, s7
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index f86f5305e6ba1..84852c2632f67 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -67,32 +67,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmax3_olt_0_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v2, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_max3_f32 v0, v0, v1, v2
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmax3_olt_0_f32:
@@ -199,32 +199,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmax3_olt_1_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v2, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_max3_f32 v0, v2, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmax3_olt_1_f32:
@@ -338,32 +338,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmax3_olt_0_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_max3_f16 v0, v0, v1, v2
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmax3_olt_0_f16:
@@ -478,32 +478,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmax3_olt_1_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_max3_f16 v0, v2, v0, v1
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmax3_olt_1_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 6634d36122d0a..84099e472d65f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -83,14 +83,14 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
@@ -195,14 +195,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32:
@@ -308,14 +308,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
@@ -421,14 +421,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
@@ -538,15 +538,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
 ;
 ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 4.0, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, 2.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
@@ -668,33 +668,33 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
 ;
 ; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, 2.0, v1
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v2, 4.0, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, 2.0, v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
@@ -831,15 +831,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_test_fmed3_r_i_i_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], 2.0
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_r_i_i_f64:
@@ -943,13 +943,13 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
@@ -1057,29 +1057,29 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ;
 ; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-GISEL-NEXT:    v_cmp_nlt_f32_e32 vcc, 2.0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 2.0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_ngt_f32_e32 vcc, 4.0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
@@ -1244,33 +1244,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
@@ -1438,33 +1438,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, v1, -v2, v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v2, -v2, -v2
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
@@ -1632,33 +1632,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, v1, v2, -v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v3, -v3, -v3
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
@@ -1828,34 +1828,34 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -v1, |v2|, -|v3|
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, |v2|, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
@@ -2034,35 +2034,35 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -|v1|, -|v2|, -|v3|
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v2, -|v2|, -|v2|
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
@@ -2250,20 +2250,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0:
@@ -2415,17 +2415,17 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0:
@@ -2569,17 +2569,17 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_nnan_call_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_call_med3_f32_pat0:
@@ -2723,17 +2723,17 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_fast_call_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_fast_call_med3_f32_pat0:
@@ -2889,17 +2889,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0:
@@ -3043,17 +3043,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1:
@@ -3199,33 +3199,33 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
@@ -3391,17 +3391,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2:
@@ -3545,17 +3545,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3:
@@ -3699,17 +3699,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4:
@@ -3853,17 +3853,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5:
@@ -4007,17 +4007,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6:
@@ -4161,17 +4161,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7:
@@ -4315,17 +4315,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8:
@@ -4469,17 +4469,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9:
@@ -4623,17 +4623,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10:
@@ -4777,17 +4777,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11:
@@ -4931,17 +4931,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12:
@@ -5085,17 +5085,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13:
@@ -5239,17 +5239,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14:
@@ -5393,17 +5393,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15:
@@ -5550,17 +5550,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16:
@@ -5743,14 +5743,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -5761,7 +5761,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
@@ -5947,14 +5947,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -5965,7 +5965,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
@@ -6178,14 +6178,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -6196,7 +6196,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
@@ -6370,14 +6370,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_safe_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -6386,7 +6386,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0:
@@ -6569,20 +6569,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
@@ -6746,20 +6746,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
@@ -6923,20 +6923,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
@@ -7090,33 +7090,33 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
 ;
 ; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
@@ -7295,39 +7295,39 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
 ;
 ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-SDAG-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-SDAG-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_min_f32_e64 v4, -v1, v2
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v4, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-GISEL-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-GISEL-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_max_f32_e64 v4, -v1, -v1
 ; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v4, v2
 ; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
@@ -7502,18 +7502,18 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_test_global_nnans_min_max_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_global_nnans_min_max_f32:
@@ -7637,14 +7637,14 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_med3_f16 v1, v1, 2.0, 4.0
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
@@ -7825,20 +7825,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
 ;
 ; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_ushort v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_ushort v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f16_e32 v2, 2.0, v2
 ; GFX9-NEXT:    v_add_f16_e32 v3, 4.0, v3
 ; GFX9-NEXT:    v_med3_f16 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0:
@@ -7963,15 +7963,15 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: two_non_inline_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX9-NEXT:    v_max_f32_e32 v1, 0x41000000, v1
 ; GFX9-NEXT:    v_min_f32_e32 v1, 0x41800000, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: two_non_inline_constant:
@@ -8114,16 +8114,16 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: one_non_inline_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v3, 0.5, v1
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x41800000, v1
 ; GFX9-NEXT:    v_med3_f32 v2, v3, 1.0, v2
-; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -8271,18 +8271,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ;
 ; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, 0x41000000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x41000000
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v3, 0.5, v1
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v4, 0x41800000, v1
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-SDAG-NEXT:    v_med3_f32 v2, v3, s0, v2
-; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-SDAG-NEXT:    v_med3_f32 v2, v3, s2, v2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v1, off
@@ -8291,18 +8291,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ;
 ; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41000000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v4, 0.5, v1
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v5, 0x41800000, v1
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
 ; GFX9-GISEL-NEXT:    v_med3_f32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v5, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v1, off
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 2025ddb07e83a..3a55b2d50a5e5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -67,32 +67,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_0_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v2, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_0_f32:
@@ -199,32 +199,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_1_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v2, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_min3_f32 v0, v2, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_1_f32:
@@ -338,32 +338,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_0_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_min3_f16 v0, v0, v1, v2
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_0_f16:
@@ -478,32 +478,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_1_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_min3_f16 v0, v2, v0, v1
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_1_f16:
@@ -680,36 +680,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_0_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_0_f64:
@@ -827,36 +827,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_fmin3_olt_1_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s14, s2
-; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b32 s15, s11
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    s_mov_b32 s16, s8
-; GFX9-NEXT:    s_mov_b32 s17, s9
-; GFX9-NEXT:    s_mov_b32 s18, s2
-; GFX9-NEXT:    s_mov_b32 s19, s3
+; GFX9-NEXT:    s_mov_b32 s12, s2
+; GFX9-NEXT:    s_mov_b32 s13, s3
+; GFX9-NEXT:    s_mov_b32 s16, s4
+; GFX9-NEXT:    s_mov_b32 s17, s5
+; GFX9-NEXT:    s_mov_b32 s18, s10
+; GFX9-NEXT:    s_mov_b32 s19, s11
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s10
-; GFX9-NEXT:    s_mov_b32 s9, s11
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s0
+; GFX9-NEXT:    s_mov_b32 s9, s1
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_fmin3_olt_1_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 7b8384f317c6c..7c1c970b3fef7 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -111,41 +111,23 @@ define amdgpu_kernel void @fmul_f16_imm_a(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fmul_f16_imm_a:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fmul_f16_imm_a:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fmul_f16_imm_a:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_f16_imm_a:
 ; GFX11:       ; %bb.0: ; %entry
@@ -196,41 +178,23 @@ define amdgpu_kernel void @fmul_f16_imm_b(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fmul_f16_imm_b:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mul_f16_e32 v0, 4.0, v0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fmul_f16_imm_b:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_mul_f16_e32 v0, 4.0, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fmul_f16_imm_b:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    v_mul_f16_e32 v0, 4.0, v0
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_f16_imm_b:
 ; GFX11:       ; %bb.0: ; %entry
@@ -426,21 +390,21 @@ define amdgpu_kernel void @fmul_v2f16_imm_a(
 ;
 ; GFX9-LABEL: fmul_v2f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_v2f16_imm_a:
@@ -521,21 +485,21 @@ define amdgpu_kernel void @fmul_v2f16_imm_b(
 ;
 ; GFX9-LABEL: fmul_v2f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_v2f16_imm_b:
@@ -761,23 +725,23 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
 ;
 ; GFX9-LABEL: fmul_v4f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s6, 0x44004200
-; GFX9-NEXT:    s_mov_b32 s7, 0x40004800
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s2, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s3, 0x40004800
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v1, v1, s6
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s7
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    v_pk_mul_f16 v1, v1, s2
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s3
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmul_v4f16_imm_a:
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 2b556a0be2b16..9300dfcb16e8a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -62,32 +62,32 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX10-FLUSH-LABEL: fmuladd_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_clause 0x2
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[8:9]
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[10:11]
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-LABEL: fmuladd_f16:
 ; GFX10-DENORM:       ; %bb.0:
-; GFX10-DENORM-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_clause 0x2
-; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[8:9]
-; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[10:11]
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[6:7]
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[4:5]
+; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[0:1]
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmuladd_f16:
@@ -176,48 +176,48 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX10-FLUSH-LABEL: fmul_fadd_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_clause 0x2
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[8:9]
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[10:11]
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    s_clause 0x2
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[8:9]
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[10:11]
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7]
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    s_clause 0x2
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[8:9]
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[10:11]
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmul_fadd_f16:
@@ -326,32 +326,32 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
 ;
 ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_clause 0x2
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[8:9]
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[10:11]
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
 ; GFX10-DENORM:       ; %bb.0:
-; GFX10-DENORM-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_clause 0x2
-; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[6:7]
-; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[8:9]
-; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[10:11]
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[6:7]
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[4:5]
+; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[0:1]
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
@@ -1425,49 +1425,49 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_f16:
@@ -1600,49 +1600,49 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
@@ -1775,49 +1775,49 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
@@ -1951,49 +1951,49 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ;
 ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
@@ -2127,49 +2127,49 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ;
 ; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
@@ -2304,49 +2304,49 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ;
 ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-STRICT:       ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
-; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index 742aeb96fcc2c..105d9246880a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -271,12 +271,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v1, s10
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b32 v1, v0
 ; GFX10-NEXT:    s_endpgm
@@ -657,14 +657,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
+; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
@@ -736,14 +736,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ;
 ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
+; G_GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
 ; G_GFX10-NEXT:    s_endpgm
 ;
 ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index 950d228f29929..e124aadf4e8c2 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -248,12 +248,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v1, s10
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b32 v1, v0
 ; GFX10-NEXT:    s_endpgm
@@ -600,14 +600,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
+; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
@@ -665,14 +665,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ;
 ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v1, v0, s[10:11]
+; G_GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
 ; G_GFX10-NEXT:    s_endpgm
 ;
 ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index d964cf4858f09..ce1fcccf4a17c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1022,22 +1022,22 @@ main_body:
 define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s7
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1047,22 +1047,22 @@ main_body:
 define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s7
-; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fmin_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1072,22 +1072,22 @@ main_body:
 define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s7
-; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fmax_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
 ; GFX940-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1139,14 +1139,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB39_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:  .LBB39_2:
@@ -1166,13 +1166,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_cbranch_execz .LBB40_2
 ; GFX90A-NEXT:  ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:  .LBB40_2:
@@ -1187,14 +1187,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB40_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB40_2:
@@ -1249,14 +1249,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB41_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:  .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX90A-NEXT:  ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:  .LBB42_2:
@@ -1297,14 +1297,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB42_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB42_2:
@@ -1519,14 +1519,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX940-NEXT:    s_cbranch_execz .LBB49_2
 ; GFX940-NEXT:  ; %bb.1:
-; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
+; GFX940-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:  .LBB49_2:
@@ -1539,12 +1539,12 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1610,12 +1610,12 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1760,23 +1760,23 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s7
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
@@ -1806,12 +1806,12 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1846,23 +1846,23 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s7
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
@@ -1892,23 +1892,23 @@ main_body:
 define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s7
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_endpgm
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index 90c1759070a59..f18f5752269e0 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -452,13 +452,13 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ;
 ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    v_mov_b32_e32 v2, s10
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v2, s11
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -506,13 +506,13 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ;
 ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s10
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s11
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s7
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index ff6700d10ff53..6a2a8c3ce595d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -452,13 +452,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    v_mov_b32_e32 v2, s10
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v2, s11
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -506,13 +506,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; G_GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s10
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s11
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s7
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 75f4dff14fcbd..8c6dc4395839c 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -23,41 +23,23 @@ define amdgpu_kernel void @fpext_f16_to_f32(
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -107,43 +89,24 @@ define amdgpu_kernel void @fpext_f16_to_f64(
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fpext_f16_to_f64:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fpext_f16_to_f64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fpext_f16_to_f64:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fpext_f16_to_f64:
 ; GFX11:       ; %bb.0: ; %entry
@@ -196,43 +159,24 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32(
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fpext_v2f16_to_v2f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fpext_v2f16_to_v2f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fpext_v2f16_to_v2f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX89-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fpext_v2f16_to_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -288,47 +232,26 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fpext_v2f16_to_v2f64:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
-; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fpext_v2f16_to_v2f64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fpext_v2f16_to_v2f64:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; GFX89-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fpext_v2f16_to_v2f64:
 ; GFX11:       ; %bb.0: ; %entry
@@ -427,41 +350,23 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fneg_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fneg_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fneg_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fneg_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -511,41 +416,23 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fabs_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fabs_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fabs_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -595,41 +482,23 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fneg_fabs_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fneg_fabs_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -686,49 +555,27 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fneg_multi_use_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v1, -v0
-; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fneg_multi_use_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -v0
+; GFX89-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -789,49 +636,27 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v1, -v0
-; VI-NEXT:    v_mul_f16_e64 v0, -v0, v0
-; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v0
-; GFX9-NEXT:    v_mul_f16_e64 v0, -v0, v0
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -v0
+; GFX89-NEXT:    v_mul_f16_e64 v0, -v0, v0
+; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -891,49 +716,27 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fabs_multi_use_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
-; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fabs_multi_use_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
+; GFX89-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -994,49 +797,27 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
-; VI-NEXT:    v_mul_f16_e64 v0, |v0|, v0
-; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
-; GFX9-NEXT:    v_mul_f16_e64 v0, |v0|, v0
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
+; GFX89-NEXT:    v_mul_f16_e64 v0, |v0|, v0
+; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -1096,49 +877,27 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
-; VI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
-; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
+; GFX89-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -1200,49 +959,27 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
-; VI-NEXT:    v_mul_f16_e64 v0, -|v0|, v0
-; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
-; GFX9-NEXT:    v_mul_f16_e64 v0, -|v0|, v0
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
+; GFX89-NEXT:    v_mul_f16_e64 v0, -|v0|, v0
+; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
@@ -1283,3 +1020,6 @@ entry:
 declare half @llvm.fabs.f16(half) #1
 
 attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
+; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 667a3f398c08a..99818df6175bd 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -22,13 +22,13 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
-; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
-; SDAG-NEXT:    s_mov_b32 s7, -1
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; SDAG-NEXT:    s_mov_b32 s5, -1
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -394,13 +394,13 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
-; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
-; SDAG-NEXT:    s_mov_b32 s7, -1
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; SDAG-NEXT:    s_mov_b32 s5, -1
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -765,13 +765,13 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s7, -1
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; SDAG-NEXT:    s_mov_b32 s5, -1
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1123,13 +1123,13 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s7, -1
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; SDAG-NEXT:    s_mov_b32 s5, -1
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1509,13 +1509,13 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s7, -1
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; SDAG-NEXT:    s_mov_b32 s5, -1
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1860,13 +1860,13 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
 ; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
+; SDAG-NEXT:    s_movk_i32 s4, 0xff7f
 ; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s7, -1
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; SDAG-NEXT:    s_mov_b32 s5, -1
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 0817ac1b3cd67..65ac2e240469d 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -71,32 +71,32 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16:
@@ -208,34 +208,34 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_f64_to_f16:
@@ -358,36 +358,36 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s1
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
@@ -519,40 +519,40 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v2, v[2:3]
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[0:1]
-; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
+; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
@@ -673,32 +673,32 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -s0
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -s2
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
@@ -807,32 +807,32 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s0|
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s2|
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
@@ -941,32 +941,32 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ;
 ; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -|v0|
-; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -|s0|
-; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -|s2|
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
@@ -1076,32 +1076,32 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
@@ -1215,32 +1215,32 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s0|
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s2|
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
@@ -1359,34 +1359,34 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ;
 ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s10, s2
-; GFX9-SDAG-NEXT:    s_mov_b32 s11, s3
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, -1
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s6
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s7
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, s6
-; GFX9-SDAG-NEXT:    s_mov_b32 s9, s7
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s2
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s3
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, s4
-; GFX9-SDAG-NEXT:    s_mov_b32 s1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-GISEL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 4f230140f7ba4..ea588df86b846 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -135,12 +135,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: fshl_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 25
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshl_i32_imm:
@@ -157,11 +157,11 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshl_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 25
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshl_i32_imm:
@@ -732,15 +732,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX9-LABEL: orxor2or1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, s6, 7
-; GFX9-NEXT:    s_or_b32 s0, s7, s0
-; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX9-NEXT:    s_cselect_b32 s0, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_lshl_b32 s4, s2, 7
+; GFX9-NEXT:    s_or_b32 s4, s3, s4
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: orxor2or1:
@@ -759,15 +759,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ;
 ; GFX10-LABEL: orxor2or1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshl_b32 s0, s6, 7
-; GFX10-NEXT:    s_or_b32 s0, s7, s0
-; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX10-NEXT:    s_cselect_b32 s0, s6, s7
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_lshl_b32 s4, s2, 7
+; GFX10-NEXT:    s_or_b32 s4, s3, s4
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: orxor2or1:
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 31f574d44ab8c..dbcebe6e07e3f 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -127,12 +127,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX9-LABEL: fshr_i32_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_alignbit_b32 v1, s6, v1, 7
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; R600-LABEL: fshr_i32_imm:
@@ -149,11 +149,11 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: fshr_i32_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s6, s7, 7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fshr_i32_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index 2a79793443fb2..f6df1cbbdd06b 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -111,41 +111,23 @@ define amdgpu_kernel void @fsub_f16_imm_a(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fsub_f16_imm_a:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_sub_f16_e32 v0, 1.0, v0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fsub_f16_imm_a:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_sub_f16_e32 v0, 1.0, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fsub_f16_imm_a:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    v_sub_f16_e32 v0, 1.0, v0
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fsub_f16_imm_a:
 ; GFX11:       ; %bb.0: ; %entry
@@ -196,41 +178,23 @@ define amdgpu_kernel void @fsub_f16_imm_b(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: fsub_f16_imm_b:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_add_f16_e32 v0, -2.0, v0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: fsub_f16_imm_b:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_add_f16_e32 v0, -2.0, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: fsub_f16_imm_b:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    v_add_f16_e32 v0, -2.0, v0
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fsub_f16_imm_b:
 ; GFX11:       ; %bb.0: ; %entry
@@ -426,21 +390,21 @@ define amdgpu_kernel void @fsub_v2f16_imm_a(
 ;
 ; GFX9-LABEL: fsub_v2f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0x40003c00
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, s4 neg_lo:[1,0] neg_hi:[1,0]
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fsub_v2f16_imm_a:
@@ -521,21 +485,21 @@ define amdgpu_kernel void @fsub_v2f16_imm_b(
 ;
 ; GFX9-LABEL: fsub_v2f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0xbc00c000
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s0, 0xbc00c000
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fsub_v2f16_imm_b:
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index 31069e567d86d..7f6a3ad5c9346 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -135,10 +135,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v0, s4
-; GFX908-NEXT:    v_mov_b32_e32 v1, s5
+; GFX908-NEXT:    v_mov_b32_e32 v0, s0
+; GFX908-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX908-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX908-NEXT:    s_endpgm
 ;
@@ -147,9 +147,9 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX90A-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 16c09cf600080..9bee539b1e4e5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -5105,13 +5105,13 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_i32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] offset:16 glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %in, i64 4
@@ -5159,13 +5159,13 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a
 ;
 ; GFX9-LABEL: atomic_load_i32_negoffset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] offset:-512 glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %in, i64 -128
@@ -5211,13 +5211,13 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_f32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] offset:16 glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr float, ptr addrspace(1) %in, i64 4
@@ -5261,13 +5261,13 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1
 ;
 ; GFX9-LABEL: atomic_load_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -5812,13 +5812,13 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs
 ;
 ; GFX9-LABEL: atomic_load_i8_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:16 glc
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(1) %in, i64 16
@@ -5866,13 +5866,13 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad
 ;
 ; GFX9-LABEL: atomic_load_i8_negoffset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v1, v0, s[4:5] offset:-512 glc
+; GFX9-NEXT:    global_load_ubyte v1, v0, s[0:1] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(1) %in, i64 -512
@@ -5993,13 +5993,13 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_i16_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:16 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i16, ptr addrspace(1) %in, i64 8
@@ -6047,13 +6047,13 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
 ;
 ; GFX9-LABEL: atomic_load_i16_negoffset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:-512 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i16, ptr addrspace(1) %in, i64 -256
@@ -7074,13 +7074,13 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_f16_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:16 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr half, ptr addrspace(1) %in, i64 8
   %val = load atomic half, ptr addrspace(1) %gep  seq_cst, align 2
@@ -7127,13 +7127,13 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
 ;
 ; GFX9-LABEL: atomic_load_f16_negoffset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:-512 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr half, ptr addrspace(1) %in, i64 -256
   %val = load atomic half, ptr addrspace(1) %gep  seq_cst, align 2
@@ -7176,13 +7176,13 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
 ;
 ; GFX9-LABEL: atomic_load_bf16_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:16 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8
   %val = load atomic bfloat, ptr addrspace(1) %gep  seq_cst, align 2
@@ -7229,13 +7229,13 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
 ;
 ; GFX9-LABEL: atomic_load_bf16_negoffset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:-512 glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256
   %val = load atomic bfloat, ptr addrspace(1) %gep  seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index c7fa2a2ede388..a7ba8a084272b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_mov_b32 s0, s7
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_mov_b32 s4, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ;
 ; GFX9-LABEL: atomic_max_i32_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_mov_b32 s0, s7
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_mov_b32 s4, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s6, v1
+; GFX9-NEXT:    v_max_i32_e32 v0, s2, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_mov_b32 s0, s7
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_mov_b32 s4, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_u32_e32 v0, s6, v1
+; GFX9-NEXT:    v_max_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB105_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ;
 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s1, s7, 31
-; GFX9-NEXT:    s_mov_b32 s0, s7
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX9-NEXT:    s_mov_b32 s4, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_min_i32_e32 v0, s6, v1
+; GFX9-NEXT:    v_min_i32_e32 v0, s2, v1
 ; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index d1a371fc4356f..40f0acf3d5d09 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -33,12 +33,12 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_add_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -259,18 +259,18 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_add_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
@@ -331,12 +331,12 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_add_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -550,18 +550,18 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_add_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_add_i64_ret_addr64:
@@ -617,12 +617,12 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_and_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -843,18 +843,18 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_and_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
@@ -915,12 +915,12 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_and_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -1134,18 +1134,18 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_and_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_and_i64_ret_addr64:
@@ -1201,12 +1201,12 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_sub_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -1427,18 +1427,18 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_sub_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
@@ -1499,12 +1499,12 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_sub_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -1718,18 +1718,18 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_sub_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64:
@@ -1781,12 +1781,12 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_max_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_offset:
@@ -1994,17 +1994,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
@@ -2061,12 +2061,12 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_max_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64:
@@ -2267,17 +2267,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_ret_addr64:
@@ -2329,12 +2329,12 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
 ;
 ; GFX9-LABEL: atomic_umax_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_offset:
@@ -2542,17 +2542,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
@@ -2609,12 +2609,12 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_umax_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64:
@@ -2815,17 +2815,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64:
@@ -2877,12 +2877,12 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_min_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_offset:
@@ -3090,17 +3090,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
@@ -3157,12 +3157,12 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_min_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64:
@@ -3363,17 +3363,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_min_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_ret_addr64:
@@ -3425,12 +3425,12 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
 ;
 ; GFX9-LABEL: atomic_umin_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_offset:
@@ -3638,17 +3638,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
@@ -3705,12 +3705,12 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_umin_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64:
@@ -3911,17 +3911,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umin_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64:
@@ -3977,12 +3977,12 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_or_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4203,18 +4203,18 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: atomic_or_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
@@ -4275,12 +4275,12 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_or_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4494,18 +4494,18 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-LABEL: atomic_or_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_or_i64_ret_addr64:
@@ -4561,12 +4561,12 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
 ;
 ; GFX9-LABEL: atomic_xchg_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4617,12 +4617,12 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
 ;
 ; GFX9-LABEL: atomic_xchg_f64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4673,12 +4673,12 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xchg_pointer_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -4899,18 +4899,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
@@ -4971,12 +4971,12 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_xchg_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -5190,18 +5190,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xchg_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
@@ -5257,12 +5257,12 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_xor_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -5483,18 +5483,18 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
@@ -5555,12 +5555,12 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_xor_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -5774,18 +5774,18 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_xor_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64:
@@ -6001,17 +6001,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
@@ -6079,16 +6079,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[8:9], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -6329,17 +6329,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_ret:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_cmpxchg_i64_ret:
@@ -6404,16 +6404,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
 ;
 ; GFX9-LABEL: atomic_cmpxchg_i64_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[8:9], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
@@ -6573,13 +6573,13 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: atomic_load_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 glc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_load_i64_offset:
@@ -6640,13 +6640,13 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
 ;
 ; GFX9-LABEL: atomic_load_i64_neg_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:-32 glc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_load_i64_neg_offset:
@@ -6703,13 +6703,13 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
 ;
 ; GFX9-LABEL: atomic_load_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] glc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_load_i64:
@@ -7005,12 +7005,12 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_store_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_store_i64_offset:
@@ -7057,12 +7057,12 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: atomic_store_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_store_i64:
@@ -7320,12 +7320,12 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_inc_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
@@ -7528,12 +7528,12 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: atomic_dec_i64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 9af7e0978f9db..8897ad3e950a5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -5064,37 +5064,37 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
-; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s7
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:  .LBB89_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB89_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5300,37 +5300,37 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_max_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s7
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6526,37 +6526,37 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
-; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s7
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:  .LBB103_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB103_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6650,37 +6650,37 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s7
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:  .LBB104_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB104_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8862,37 +8862,37 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ;
 ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
-; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s7
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:  .LBB126_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
+; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB126_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8972,29 +8972,29 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: atomic_min_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:  .LBB127_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
+; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[4:5] glc
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB127_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
@@ -9087,37 +9087,37 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: atomic_min_i64_ret_addr64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
-; GFX9-NEXT:    s_add_u32 s0, s4, s0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
+; GFX9-NEXT:    s_add_u32 s0, s0, s6
+; GFX9-NEXT:    s_addc_u32 s1, s1, s7
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
-; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
+; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB128_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 5abd4c9069c91..297b5180dfe9b 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -35,8 +35,8 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
-; CHECK-NEXT:    s_movk_i32 s20, 0x130
-; CHECK-NEXT:    s_mov_b32 s21, s24
+; CHECK-NEXT:    s_movk_i32 s4, 0x130
+; CHECK-NEXT:    s_mov_b32 s5, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_writelane_b32 v4, s36, 0
 ; CHECK-NEXT:    v_writelane_b32 v4, s37, 1
@@ -49,7 +49,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v4, s44, 8
 ; CHECK-NEXT:    v_writelane_b32 v4, s45, 9
 ; CHECK-NEXT:    v_writelane_b32 v4, s46, 10
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v4, s47, 11
 ; CHECK-NEXT:    v_writelane_b32 v4, s48, 12
 ; CHECK-NEXT:    v_writelane_b32 v4, s49, 13
@@ -78,17 +78,17 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v4, s13, 25
 ; CHECK-NEXT:    v_writelane_b32 v4, s14, 26
 ; CHECK-NEXT:    v_writelane_b32 v4, s15, 27
-; CHECK-NEXT:    v_writelane_b32 v8, s52, 18
 ; CHECK-NEXT:    v_writelane_b32 v4, s16, 28
-; CHECK-NEXT:    v_writelane_b32 v8, s53, 19
+; CHECK-NEXT:    v_writelane_b32 v8, s52, 18
 ; CHECK-NEXT:    v_writelane_b32 v4, s17, 29
-; CHECK-NEXT:    v_writelane_b32 v8, s54, 20
+; CHECK-NEXT:    v_writelane_b32 v8, s53, 19
 ; CHECK-NEXT:    v_writelane_b32 v4, s18, 30
-; CHECK-NEXT:    s_mov_b32 s26, 48
-; CHECK-NEXT:    s_mov_b32 s27, s24
-; CHECK-NEXT:    v_writelane_b32 v8, s55, 21
+; CHECK-NEXT:    v_writelane_b32 v8, s54, 20
 ; CHECK-NEXT:    v_writelane_b32 v4, s19, 31
-; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[26:27], 0x0
+; CHECK-NEXT:    s_mov_b32 s4, 48
+; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    v_writelane_b32 v8, s55, 21
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; CHECK-NEXT:    v_writelane_b32 v8, s56, 22
 ; CHECK-NEXT:    v_writelane_b32 v8, s57, 23
 ; CHECK-NEXT:    v_writelane_b32 v8, s58, 24
@@ -107,15 +107,15 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    v_writelane_b32 v8, s65, 31
 ; CHECK-NEXT:    v_writelane_b32 v4, s9, 37
 ; CHECK-NEXT:    v_writelane_b32 v8, s66, 32
-; CHECK-NEXT:    s_movk_i32 s28, 0x1f0
-; CHECK-NEXT:    s_movk_i32 s30, 0x2f0
+; CHECK-NEXT:    s_movk_i32 s26, 0x1f0
+; CHECK-NEXT:    s_movk_i32 s28, 0x2f0
+; CHECK-NEXT:    s_mov_b32 s27, s24
 ; CHECK-NEXT:    s_mov_b32 s29, s24
-; CHECK-NEXT:    s_mov_b32 s31, s24
 ; CHECK-NEXT:    v_writelane_b32 v4, s10, 38
 ; CHECK-NEXT:    v_writelane_b32 v8, s67, 33
 ; CHECK-NEXT:    v_writelane_b32 v4, s11, 39
-; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[28:29], 0x0
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[26:27], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[28:29], 0x0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 81210d8f5d0ca..78653d7e21ad8 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -638,9 +638,9 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: udiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_movk_i32 s2, 0x400
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
+; GFX9-NEXT:    s_movk_i32 s4, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
@@ -651,14 +651,14 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s2, v2
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v4, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
 ; GFX9-NEXT:    v_mad_f32 v4, -v6, v0, v4
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1]
-; GFX9-NEXT:    global_store_short v5, v4, s[4:5]
+; GFX9-NEXT:    global_store_short v5, v4, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB4_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index fee59455da4c8..1d68b0ba0a280 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -2044,13 +2044,13 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: idot4_acc32_3src:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -2062,20 +2062,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc32_3src:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-DL-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
@@ -2084,19 +2084,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v3, v1, s1
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32_3src:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -2106,7 +2107,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v3, v0
-; GFX10-DL-NEXT:    global_store_dword v2, v1, s[10:11]
+; GFX10-DL-NEXT:    global_store_dword v2, v1, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: idot4_acc32_3src:
@@ -2249,13 +2250,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
@@ -2266,21 +2267,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc32_3src_3ele:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
 ; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-DL-NEXT:    s_load_dword s3, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
@@ -2290,19 +2291,20 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32_3src_3ele:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -2313,7 +2315,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT:    global_store_dword v3, v1, s[10:11]
+; GFX10-DL-NEXT:    global_store_dword v3, v1, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: idot4_acc32_3src_3ele:
@@ -2854,13 +2856,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 8, 8
@@ -2871,21 +2873,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
 ; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX9-DL-NEXT:    s_load_dword s3, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
@@ -2895,19 +2897,20 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -2918,7 +2921,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT:    global_store_dword v3, v1, s[10:11]
+; GFX10-DL-NEXT:    global_store_dword v3, v1, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 2894ae76c0be4..fb94b504781b1 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -3494,13 +3494,13 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: udot4_acc32_3src:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -3512,20 +3512,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc32_3src:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-DL-NEXT:    s_load_dword s1, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
@@ -3534,19 +3534,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v3, v1, s1
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32_3src:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -3555,7 +3556,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v0, s0
-; GFX10-DL-NEXT:    global_store_dword v1, v0, s[10:11]
+; GFX10-DL-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: udot4_acc32_3src:
@@ -3699,13 +3700,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
@@ -3716,21 +3717,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc32_3src_3ele:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
 ; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-DL-NEXT:    s_load_dword s3, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
@@ -3740,19 +3741,20 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32_3src_3ele:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -3762,7 +3764,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020100
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[10:11]
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: udot4_acc32_3src_3ele:
@@ -4304,13 +4306,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ;
 ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[6:7]
-; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[8:9]
-; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NODL-NEXT:    v_bfe_u32 v4, v1, 8, 8
@@ -4321,21 +4323,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
 ; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
 ; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX9-DL-NEXT:    s_load_dword s3, s[10:11], 0x0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
@@ -4345,19 +4347,20 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
 ; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x2
-; GFX10-DL-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX10-DL-NEXT:    s_load_dword s0, s[10:11], 0x0
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
@@ -4367,7 +4370,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020101
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[10:11]
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0:
@@ -5594,16 +5597,15 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
 ;
 ; GFX10-DL-LABEL: idot4_acc32_v8i8:
 ; GFX10-DL:       ; %bb.0: ; %entry
-; GFX10-DL-NEXT:    s_clause 0x1
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
-; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT:    global_store_dword v2, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
 ; GFX11-DL-LABEL: idot4_acc32_v8i8:
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index 44e8ae01fd692..3cabe41afb05a 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
   ; GCN-NEXT:   liveins: $vgpr0, $sgpr2_sgpr3
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
-  ; GCN-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+  ; GCN-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
   ; GCN-NEXT:   renamable $sgpr6 = COPY renamable $sgpr1
   ; GCN-NEXT:   renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
   ; GCN-NEXT:   renamable $sgpr4 = S_MOV_B32 61440
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 17fe1721f73d7..b9dc27cb7e019 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -438,121 +438,121 @@ entry:
 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_sub_i32 s0, 0, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX9-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX9-NEXT:    s_mul_i32 s1, s0, s7
-; GFX9-NEXT:    s_sub_i32 s1, s6, s1
-; GFX9-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-NEXT:    s_sub_i32 s3, s1, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX9-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT:    s_mul_i32 s5, s4, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_add_i32 s5, s4, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: udiv_i32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX90A-NEXT:    s_sub_i32 s0, 0, s7
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX90A-NEXT:    s_mul_i32 s0, s0, s1
-; GFX90A-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX90A-NEXT:    s_add_i32 s1, s1, s0
-; GFX90A-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX90A-NEXT:    s_mul_i32 s1, s0, s7
-; GFX90A-NEXT:    s_sub_i32 s1, s6, s1
-; GFX90A-NEXT:    s_add_i32 s2, s0, 1
-; GFX90A-NEXT:    s_sub_i32 s3, s1, s7
-; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX90A-NEXT:    s_add_i32 s2, s0, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX90A-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX90A-NEXT:    s_mul_i32 s4, s4, s5
+; GFX90A-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX90A-NEXT:    s_add_i32 s5, s5, s4
+; GFX90A-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX90A-NEXT:    s_mul_i32 s5, s4, s3
+; GFX90A-NEXT:    s_sub_i32 s2, s2, s5
+; GFX90A-NEXT:    s_add_i32 s6, s4, 1
+; GFX90A-NEXT:    s_sub_i32 s5, s2, s3
+; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX90A-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX90A-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX90A-NEXT:    s_add_i32 s5, s4, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX90A-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udiv_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX10-NEXT:    s_sub_i32 s1, 0, s7
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX10-NEXT:    s_sub_i32 s5, 0, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_mul_i32 s1, s1, s0
-; GFX10-NEXT:    s_mul_hi_u32 s1, s0, s1
-; GFX10-NEXT:    s_add_i32 s0, s0, s1
-; GFX10-NEXT:    s_mul_hi_u32 s0, s6, s0
-; GFX10-NEXT:    s_mul_i32 s1, s0, s7
-; GFX10-NEXT:    s_add_i32 s2, s0, 1
-; GFX10-NEXT:    s_sub_i32 s1, s6, s1
-; GFX10-NEXT:    s_sub_i32 s3, s1, s7
-; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX10-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX10-NEXT:    s_add_i32 s2, s0, 1
-; GFX10-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX10-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_mul_i32 s5, s5, s4
+; GFX10-NEXT:    s_mul_hi_u32 s5, s4, s5
+; GFX10-NEXT:    s_add_i32 s4, s4, s5
+; GFX10-NEXT:    s_mul_hi_u32 s4, s2, s4
+; GFX10-NEXT:    s_mul_i32 s5, s4, s3
+; GFX10-NEXT:    s_sub_i32 s2, s2, s5
+; GFX10-NEXT:    s_add_i32 s5, s4, 1
+; GFX10-NEXT:    s_sub_i32 s6, s2, s3
+; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX10-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX10-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX10-NEXT:    s_add_i32 s5, s4, 1
+; GFX10-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX10-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-FLATSCR-LABEL: udiv_i32:
 ; GFX9-FLATSCR:       ; %bb.0:
-; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s0, 0, s7
+; GFX9-FLATSCR-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-FLATSCR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-FLATSCR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-FLATSCR-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s1, s0
-; GFX9-FLATSCR-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s0, s6, s1
-; GFX9-FLATSCR-NEXT:    s_mul_i32 s1, s0, s7
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s1, s6, s1
-; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-FLATSCR-NEXT:    s_sub_i32 s3, s1, s7
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX9-FLATSCR-NEXT:    s_add_i32 s2, s0, 1
-; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s1, s7
-; GFX9-FLATSCR-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-FLATSCR-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-FLATSCR-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-FLATSCR-NEXT:    s_mul_i32 s5, s4, s3
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-FLATSCR-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-FLATSCR-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-FLATSCR-NEXT:    s_add_i32 s5, s4, 1
+; GFX9-FLATSCR-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-FLATSCR-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    s_endpgm
 ;
@@ -1660,4 +1660,3 @@ entry:
   %bc = bitcast <2 x i32> %r.1 to <2 x float>
   ret <2 x float> %bc
 }
-
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index eed85345b3b1c..e1caf3bea6119 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -32,21 +32,21 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
 ;
 ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s6, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, s2, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s6, s7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
@@ -281,24 +281,24 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, v1, 1.0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
@@ -361,24 +361,24 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v1, 1.0, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 67e4feed21fac..224de9512c493 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX8,PREGFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,PREGFX11 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s
 
@@ -549,20 +549,12 @@ end:
 ; GCN-LABEL: {{^}}test_export_clustering:
 ; PREGFX11-DAG: v_mov_b32_e32 [[W0:v[0-9]+]], 0
 ; PREGFX11-DAG: v_mov_b32_e32 [[W1:v[0-9]+]], 1.0
-
-; GFX8-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s1
-; GFX8-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s0
-; GFX8-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
-; GFX8-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
-; GFX8: exp param0 [[Y]], [[X]], [[Z0]], [[W0]]{{$}}
-; GFX8-NEXT: exp param1 [[Y]], [[X]], [[Z1]], [[W1]] done{{$}}
-
-; GFX10-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
-; GFX10-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
-; GFX10-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
-; GFX10-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
-; GFX10: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
-; GFX10-NEXT: exp param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}}
+; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
+; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
+; PREGFX11-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
+; PREGFX11-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
+; PREGFX11: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
+; PREGFX11-NEXT: exp param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}}
 define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %x, float %y, float %z0, float 0.0, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index dad77d1efd3a8..a26b84e17374a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -27,12 +27,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -50,12 +50,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
@@ -78,12 +78,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -101,12 +101,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s0, |s6|, |s7|
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f32_e64 s2, |s2|, |s3|
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %src_input = call float @llvm.fabs.f32(float %src)
@@ -973,12 +973,12 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq:
@@ -996,12 +996,12 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
   store i32 %result, ptr addrspace(1) %out
@@ -1023,12 +1023,12 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_one:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_one:
@@ -1046,12 +1046,12 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_one:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
   store i32 %result, ptr addrspace(1) %out
@@ -1073,12 +1073,12 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt:
@@ -1096,12 +1096,12 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
   store i32 %result, ptr addrspace(1) %out
@@ -1123,12 +1123,12 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oge:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_oge:
@@ -1146,12 +1146,12 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oge:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
   store i32 %result, ptr addrspace(1) %out
@@ -1173,12 +1173,12 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_olt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_olt:
@@ -1196,12 +1196,12 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_olt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
   store i32 %result, ptr addrspace(1) %out
@@ -1223,12 +1223,12 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ole:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ole:
@@ -1246,12 +1246,12 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ole:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
   store i32 %result, ptr addrspace(1) %out
@@ -1273,12 +1273,12 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq:
@@ -1296,12 +1296,12 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
   store i32 %result, ptr addrspace(1) %out
@@ -1323,12 +1323,12 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_o:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_o:
@@ -1346,12 +1346,12 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_o:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7)
   store i32 %result, ptr addrspace(1) %out
@@ -1373,12 +1373,12 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uo:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_uo:
@@ -1396,12 +1396,12 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uo:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8)
   store i32 %result, ptr addrspace(1) %out
@@ -1423,12 +1423,12 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_une:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_une:
@@ -1446,12 +1446,12 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_une:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
   store i32 %result, ptr addrspace(1) %out
@@ -1473,12 +1473,12 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt:
@@ -1496,12 +1496,12 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
   store i32 %result, ptr addrspace(1) %out
@@ -1523,12 +1523,12 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uge:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_uge:
@@ -1546,12 +1546,12 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uge:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
   store i32 %result, ptr addrspace(1) %out
@@ -1573,12 +1573,12 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ult:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ult:
@@ -1596,12 +1596,12 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ult:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
   store i32 %result, ptr addrspace(1) %out
@@ -1623,12 +1623,12 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ule:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_fcmp_f64_ule:
@@ -1646,12 +1646,12 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ule:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
   store i32 %result, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index d1883d9196af6..7e78d8b05d09f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -30,14 +30,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -88,14 +88,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    v_cmp_eq_f32_e64 s[0:1], |s6|, |v0|
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -1059,15 +1059,15 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_oeq:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_oeq:
@@ -1119,15 +1119,15 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_one:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_one:
@@ -1179,15 +1179,15 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ogt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ogt:
@@ -1239,15 +1239,15 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_oge:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_oge:
@@ -1299,15 +1299,15 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_olt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_olt:
@@ -1359,15 +1359,15 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ole:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ole:
@@ -1419,15 +1419,15 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ueq:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ueq:
@@ -1479,15 +1479,15 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_o:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_o:
@@ -1539,15 +1539,15 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_uo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_uo:
@@ -1599,15 +1599,15 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_une:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_une:
@@ -1659,15 +1659,15 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ugt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ugt:
@@ -1719,15 +1719,15 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_uge:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_uge:
@@ -1779,15 +1779,15 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ult:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ult:
@@ -1839,15 +1839,15 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ;
 ; GFX9-LABEL: v_fcmp_f64_ule:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40590000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fcmp_f64_ule:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 453913b334a49..78d5da8dda177 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -45,6 +45,17 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
 ; GFX11-NEXT:    v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX11-NEXT:    scratch_store_b16 off, v0, s0
 ; GFX11-NEXT:    s_endpgm
+; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
+; GISEL-GFX11:       ; %bb.0: ; %entry
+; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    scratch_load_b32 v0, off, s1
+; GISEL-GFX11-NEXT:    scratch_load_b32 v1, off, s2
+; GISEL-GFX11-NEXT:    scratch_load_u16 v2, off, s3
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-GFX11-NEXT:    v_dot2_bf16_bf16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GISEL-GFX11-NEXT:    scratch_store_b16 off, v0, s0
+; GISEL-GFX11-NEXT:    s_endpgm
     ptr addrspace(5) %r,
     ptr addrspace(5) %a,
     ptr addrspace(5) %b,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
index 4a66b761306f3..d7dd0ce58a08f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
 
 ; GCN-LABEL: {{^}}global_atomic_csub_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
-; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN
+; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
@@ -15,7 +15,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_no_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
-; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
+; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1]
 define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
 main_body:
   %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
@@ -24,7 +24,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
-; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 th:TH_ATOMIC_RETURN
+; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN
 define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) {
 main_body:
   %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
@@ -34,7 +34,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn:
 ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
+; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4
 define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
 main_body:
   %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index d0d759a57a682..9e3e393d82e22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -644,12 +644,12 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_eq:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_eq:
@@ -667,12 +667,12 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_eq:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
   store i32 %result, ptr addrspace(1) %out
@@ -694,12 +694,12 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_ne:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_ne:
@@ -717,12 +717,12 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_ne:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
   store i32 %result, ptr addrspace(1) %out
@@ -744,12 +744,12 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ugt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_u64_ugt:
@@ -767,12 +767,12 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ugt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
   store i32 %result, ptr addrspace(1) %out
@@ -794,12 +794,12 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_uge:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_u64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_le_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_u64_uge:
@@ -817,12 +817,12 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_uge:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_u64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_le_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
   store i32 %result, ptr addrspace(1) %out
@@ -844,12 +844,12 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ult:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_u64_ult:
@@ -867,12 +867,12 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ult:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
   store i32 %result, ptr addrspace(1) %out
@@ -894,12 +894,12 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ule:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_u64_ule:
@@ -917,12 +917,12 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ule:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
   store i32 %result, ptr addrspace(1) %out
@@ -944,12 +944,12 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sgt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_sgt:
@@ -967,12 +967,12 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sgt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
   store i32 %result, ptr addrspace(1) %out
@@ -994,12 +994,12 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sge:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_le_i64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_le_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_sge:
@@ -1017,12 +1017,12 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sge:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_le_i64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_le_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
   store i32 %result, ptr addrspace(1) %out
@@ -1044,12 +1044,12 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_slt:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_slt:
@@ -1067,12 +1067,12 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_slt:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
   store i32 %result, ptr addrspace(1) %out
@@ -1094,12 +1094,12 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sle:
 ; SDAG-GFX10:       ; %bb.0:
-; SDAG-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT:    v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
-; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX10-NEXT:    v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; SDAG-GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX10-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: v_icmp_i64_sle:
@@ -1117,12 +1117,12 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sle:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT:    v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
-; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX10-NEXT:    v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX10-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
   store i32 %result, ptr addrspace(1) %out
@@ -1759,16 +1759,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX10-LABEL: v_icmp_i1_ne0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_gt_u32 s6, 1
-; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX10-NEXT:    s_cmp_gt_u32 s7, 2
-; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX10-NEXT:    s_and_b32 s0, s0, s1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_cmp_gt_u32 s2, 1
+; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX10-NEXT:    s_cmp_gt_u32 s3, 2
+; GFX10-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index cad3c54ae54b0..60e242bf5b0e8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -716,15 +716,15 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_eq:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_eq:
@@ -776,15 +776,15 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_ne:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_ne:
@@ -836,15 +836,15 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_u64_ugt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ugt:
@@ -896,15 +896,15 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_u64_uge:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_uge:
@@ -956,15 +956,15 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_u64_ult:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ult:
@@ -1016,15 +1016,15 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_u64_ule:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_u64_ule:
@@ -1076,15 +1076,15 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_sgt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sgt:
@@ -1136,15 +1136,15 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_sge:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sge:
@@ -1196,15 +1196,15 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_slt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_slt:
@@ -1256,15 +1256,15 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ;
 ; GFX9-LABEL: v_icmp_i64_sle:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x64
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i64_sle:
@@ -1986,17 +1986,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ;
 ; GFX9-LABEL: v_icmp_i1_ne0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_gt_u32 s6, 1
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_cmp_gt_u32 s7, 2
+; GFX9-NEXT:    s_cmp_gt_u32 s2, 1
+; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT:    s_cmp_gt_u32 s3, 2
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %c0 = icmp ugt i32 %a, 1
   %c1 = icmp ugt i32 %b, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 0393a551dcd41..2d01703c78d78 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -163,7 +163,7 @@ main_body:
 define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
@@ -171,10 +171,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v2, s0, s4, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s5, 0, s0
-; GFX1013-NEXT:    v_add_co_u32 v4, s0, s6, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s7, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v2, s0, s0, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s1, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v4, s0, s2, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
 ; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
 ; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
@@ -182,7 +182,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -262,15 +262,15 @@ main_body:
 define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX1013-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x46004500
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x48004700
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v2, s0, s4, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s5, 0, s0
-; GFX1013-NEXT:    v_add_co_u32 v4, s0, s6, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s7, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v2, s0, s0, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s0, s1, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v4, s0, s2, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
 ; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
 ; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
@@ -278,7 +278,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x44004200
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
+; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -352,10 +352,9 @@ main_body:
 define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_clause 0x1
-; GFX1013-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GFX1013-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x34
 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
@@ -366,13 +365,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
 ; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x40e00000
 ; GFX1013-NEXT:    v_mov_b32_e32 v11, 0x41000000
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v0, s4, s4, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
 ; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
 ; GFX1013-NEXT:    v_bfrev_b32_e32 v1, 4.0
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
@@ -448,10 +447,9 @@ main_body:
 define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
 ; GFX1013:       ; %bb.0: ; %main_body
-; GFX1013-NEXT:    s_clause 0x1
-; GFX1013-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GFX1013-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x34
 ; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
@@ -459,13 +457,13 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
 ; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x46004500
 ; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x48004700
 ; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX1013-NEXT:    v_add_co_u32 v0, s4, s4, v0
+; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
 ; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
 ; GFX1013-NEXT:    v_bfrev_b32_e32 v1, 4.0
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
 ; GFX1013-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; GFX1013-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index aad74410d1453..8e9a652ae8a8e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -384,26 +384,26 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float
 define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, 1, 2
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, 1, 2
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64:
@@ -469,26 +469,26 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %
 define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, 1, 2
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, 1, 2
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64:
@@ -601,28 +601,28 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_movk_i32 s0, 0x1234
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, 0xc1d1
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    s_movk_i32 s0, 0x1234
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, 0xc1d1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64:
@@ -742,28 +742,28 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float
 define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_movk_i32 s0, 0x1234
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, 0xc1d1
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    s_movk_i32 s0, 0x1234
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, 0xc1d1
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, 0xc1d1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64:
@@ -933,30 +933,30 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s4, s5
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s4, s5
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_permlane16_b32_vvv_i64:
@@ -1104,30 +1104,30 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
 define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s4, s5
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s4, s5
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s5
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v1, v1, s4, s5
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_permlane16_b32_vvv_f64:
@@ -1179,24 +1179,24 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl
 define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32:
@@ -1337,24 +1337,24 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 %
 define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s7
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s7
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32:
@@ -1493,16 +1493,27 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl
 }
 
 define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-LABEL: v_permlane16_b32_vsv_i32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_permlane16_b32 v0, v0, s7, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i32:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32:
 ; GFX11-SDAG:       ; %bb.0:
@@ -1682,16 +1693,27 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %
 }
 
 define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) {
-; GFX10-LABEL: v_permlane16_b32_vsv_f32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_permlane16_b32 v0, v0, s7, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT:    v_permlane16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32:
 ; GFX11-SDAG:       ; %bb.0:
@@ -3095,26 +3117,26 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa
 define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64:
@@ -3180,26 +3202,26 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64
 define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64:
@@ -3358,28 +3380,28 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa
 define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_movk_i32 s0, 0x1234
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, 0xc1d1
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    s_movk_i32 s0, 0x1234
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, 0xc1d1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64:
@@ -3453,28 +3475,28 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
 define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    s_movk_i32 s0, 0x1234
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, 0xc1d1
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    s_movk_i32 s0, 0x1234
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, 0xc1d1
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, 0xc1d1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x1234
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64:
@@ -3740,30 +3762,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
 define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s4, s5
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s5
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_permlanex16_b32_vvv_i64:
@@ -3815,30 +3837,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64
 define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
-; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s4, s5
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s5
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_permlanex16_b32_vvv_f64:
@@ -3890,24 +3912,24 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub
 define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32:
@@ -3974,24 +3996,24 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s7
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s7
-; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32:
@@ -4204,16 +4226,27 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub
 }
 
 define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
-; GFX10-LABEL: v_permlanex16_b32_vsv_i32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32:
 ; GFX11-SDAG:       ; %bb.0:
@@ -4281,16 +4314,27 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32
 }
 
 define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) {
-; GFX10-LABEL: v_permlanex16_b32_vsv_f32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s6
-; GFX10-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s3, s2
+; GFX10-SDAG-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
+; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32:
 ; GFX11-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
deleted file mode 100644
index 419e19083f85e..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ /dev/null
@@ -1,313 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK
-
-define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB0_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_off:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB1_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_soff:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB2_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 4, i32 1)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB3_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_nonatomic_buffer_load_i32:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:  .LBB4_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT:    s_or_b32 s0, s1, s0
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i64:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB5_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %id.zext = zext i32 %id to i64
-  br label %bb1
-bb1:
-  %load = call i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32> %addr, i32 4, i32 0, i32 1)
-  %cmp = icmp eq i64 %load, %id.zext
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_v2i16:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB6_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 0, i32 0, i32 1)
-  %bitcast = bitcast <2 x i16> %load to i32
-  %cmp = icmp eq i32 %bitcast, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB7_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 4, i32 0, i32 1)
-  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
-  %bitcast = bitcast <2 x i16> %shortened to i32
-  %cmp = icmp eq i32 %bitcast, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_v4i32:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB8_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
-  %extracted = extractelement <4 x i32> %load, i32 3
-  %cmp = icmp eq i32 %extracted, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_ptr:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB9_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_load_b32 v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32> %addr, i32 4, i32 0, i32 1)
-  %elem = load i32, ptr %load
-  %cmp = icmp eq i32 %elem, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-; Function Attrs: nounwind readonly
-declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
-declare i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32 immarg)
-declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg)
-declare <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32 immarg)
-declare <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
-declare ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32 immarg)
-declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
-declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
deleted file mode 100644
index 6541ac9553231..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ /dev/null
@@ -1,313 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK
-
-define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB0_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_off:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB1_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB2_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB2_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 4, i32 1)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB3_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB3_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 4)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_nonptr_atomic_buffer_load_i32:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:  .LBB4_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT:    s_or_b32 s0, s1, s0
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT:    s_cbranch_execnz .LBB4_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
-  %cmp = icmp eq i32 %load, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i64:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB5_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB5_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %id.zext = zext i32 %id to i64
-  br label %bb1
-bb1:
-  %load = call i64 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i64(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
-  %cmp = icmp eq i64 %load, %id.zext
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_v2i16:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB6_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB6_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call <2 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v2i16(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1)
-  %bitcast = bitcast <2 x i16> %load to i32
-  %cmp = icmp eq i32 %bitcast, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB7_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call <4 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i16(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
-  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
-  %bitcast = bitcast <2 x i16> %shortened to i32
-  %cmp = icmp eq i32 %bitcast, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i32:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB8_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call <4 x i32> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
-  %extracted = extractelement <4 x i32> %load, i32 3
-  %cmp = icmp eq i32 %extracted, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_ptr:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB9_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_load_b32 v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
-bb:
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  br label %bb1
-bb1:
-  %load = call ptr @llvm.amdgcn.raw.ptr.atomic.buffer.load.ptr(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1)
-  %elem = load i32, ptr %load
-  %cmp = icmp eq i32 %elem, %id
-  br i1 %cmp, label %bb1, label %bb2
-bb2:
-  ret void
-}
-
-; Function Attrs: nounwind readonly
-declare i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)
-declare i64 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i64(ptr addrspace(8), i32, i32, i32 immarg)
-declare <2 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v2i16(ptr addrspace(8), i32, i32, i32 immarg)
-declare <4 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i16(ptr addrspace(8), i32, i32, i32 immarg)
-declare <4 x i32> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32 immarg)
-declare ptr @llvm.amdgcn.raw.ptr.atomic.buffer.load.ptr(ptr addrspace(8), i32, i32, i32 immarg)
-declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)
-declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index 7371d498a7070..e2f494283a3f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -808,11 +808,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
 ; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX1032GISEL-NEXT:    s_endpgm
 ;
 ; GFX1164DAGISEL-LABEL: divergent_cfg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 60af21524a04a..5304188e02f84 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -809,11 +809,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
 ; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX1032GISEL-NEXT:    s_endpgm
 ;
 ; GFX1164DAGISEL-LABEL: divergent_cfg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index b8a4674833cee..24b8a3c2dc873 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -29,19 +29,19 @@ entry:
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(7)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -83,33 +83,33 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; EXACTCUTOFF-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:80
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:96
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(7)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
@@ -151,14 +151,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(30) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
@@ -180,12 +180,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
@@ -194,10 +194,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
 ; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
 ; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
@@ -208,10 +208,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:80
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
@@ -228,8 +228,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v19, v19, v19
 ; GCN-NEXT:    v_mul_lo_u32 v18, v18, v18
-; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
 ; GCN-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; GCN-NEXT:    v_mul_lo_u32 v16, v16, v16
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
@@ -245,14 +245,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v22, v22, v22
 ; GCN-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; GCN-NEXT:    v_mul_lo_u32 v20, v20, v20
-; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
-; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
-; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:80
-; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
-; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:48
-; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
-; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
-; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
+; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -261,12 +261,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; EXACTCUTOFF-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[4:5] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[4:5]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
@@ -275,10 +275,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[4:5] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[4:5] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
@@ -289,10 +289,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[4:5] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:80
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[4:5] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:48
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(2)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
@@ -309,8 +309,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v19, v19, v19
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v18, v18, v18
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[4:5] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[4:5] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v16, v16, v16
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
@@ -326,14 +326,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v22, v22, v22
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v20, v20, v20
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[6:7] offset:112
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[6:7] offset:96
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[6:7] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[6:7] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[6:7] offset:48
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[6:7] offset:32
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[6:7] offset:16
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[6:7]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -385,12 +385,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
 ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -398,8 +398,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
 ; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -407,30 +407,30 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:80
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:80
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:80
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:80
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -442,8 +442,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -455,8 +455,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:64
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -468,18 +468,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:64
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
 ; EXACTCUTOFF:       ; %bb.0:
-; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; EXACTCUTOFF-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr4_sgpr5
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
@@ -487,8 +487,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
@@ -496,30 +496,30 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:112
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:112
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:96
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:96
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:80
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7] offset:80
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:80
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -531,8 +531,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:16
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -544,8 +544,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:64
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -557,7 +557,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:64
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 0b18e5f35a310..5514efa6838e7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -46,26 +46,26 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX9-LABEL: cos_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
 ; GFX9-NEXT:    v_cos_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cos_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
 ; GFX10-NEXT:    v_cos_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cos_f16:
@@ -142,34 +142,34 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX9-LABEL: cos_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_cos_f16_e32 v2, v3
 ; GFX9-NEXT:    v_cos_f16_e32 v1, v1
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cos_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_cos_f16_e32 v2, v3
 ; GFX10-NEXT:    v_cos_f16_e32 v1, v1
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: cos_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 4bed23487445a..142145098df87 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -442,59 +442,59 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; GFX900-SDAG-LABEL: s_exp_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc2ce8ed0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -504,18 +504,18 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index ec7e52532cd32..4d981d27c309e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -444,59 +444,59 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp10_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc23369f4
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s7, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, s3, v0
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v0, -v2
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v0, -v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_fma_f32 v4, s7, v1, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s6, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v4, s3, v1, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v6, s2, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v7, v6
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v0, -v6
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v0, -v6
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v8, v6, v7
-; GFX900-SDAG-NEXT:    v_fma_f32 v0, s6, v1, v0
+; GFX900-SDAG-NEXT:    v_fma_f32 v0, s2, v1, v0
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v8, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s3, v5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7f800000
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s7, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s3, v3
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v5
+; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s2, v5
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v3
+; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s2, v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp10_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x40549a78
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x33979a37
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v0, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s7, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v0, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s7, v1, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
@@ -506,18 +506,18 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc23369f4
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x421a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v3
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; SI-SDAG-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 32b599e63c61d..9f80e66e8f873 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -271,25 +271,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_exp2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s7, v4
-; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v4, s3, v4
+; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_exp2_v2f32:
@@ -538,10 +538,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-LABEL: s_exp2_v3f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX900-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
@@ -562,7 +562,7 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_exp2_v3f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 92658e660ea67..edcdd323cb0ae 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -104,60 +104,60 @@ define amdgpu_kernel void @fmuladd_f16(
 ;
 ; GFX10-FLUSH-LABEL: fmuladd_f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX10-FLUSH-NEXT:    s_mov_b32 s2, -1
-; GFX10-FLUSH-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s2
-; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s3
-; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s2
-; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s3
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_mov_b32 s10, -1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s11
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s6
-; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s7
-; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s8
-; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s9
+; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s4
+; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s5
 ; GFX10-FLUSH-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
 ; GFX10-FLUSH-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s10
-; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s11
-; GFX10-FLUSH-NEXT:    s_mov_b32 s10, s2
-; GFX10-FLUSH-NEXT:    s_mov_b32 s11, s3
-; GFX10-FLUSH-NEXT:    s_mov_b32 s0, s4
-; GFX10-FLUSH-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s1, s5
+; GFX10-FLUSH-NEXT:    s_mov_b32 s4, s6
+; GFX10-FLUSH-NEXT:    s_mov_b32 s5, s7
+; GFX10-FLUSH-NEXT:    s_mov_b32 s6, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s7, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s0
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s1
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX10-FLUSH-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-FLUSH-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-LABEL: fmuladd_f16:
 ; GFX10-DENORM:       ; %bb.0:
-; GFX10-DENORM-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX10-DENORM-NEXT:    s_mov_b32 s2, -1
-; GFX10-DENORM-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-DENORM-NEXT:    s_mov_b32 s14, s2
-; GFX10-DENORM-NEXT:    s_mov_b32 s15, s3
-; GFX10-DENORM-NEXT:    s_mov_b32 s18, s2
-; GFX10-DENORM-NEXT:    s_mov_b32 s19, s3
-; GFX10-DENORM-NEXT:    s_mov_b32 s22, s2
-; GFX10-DENORM-NEXT:    s_mov_b32 s23, s3
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DENORM-NEXT:    s_mov_b32 s10, -1
+; GFX10-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX10-DENORM-NEXT:    s_mov_b32 s14, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s15, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s18, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s19, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s22, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s23, s11
 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT:    s_mov_b32 s12, s6
-; GFX10-DENORM-NEXT:    s_mov_b32 s13, s7
-; GFX10-DENORM-NEXT:    s_mov_b32 s16, s8
-; GFX10-DENORM-NEXT:    s_mov_b32 s17, s9
-; GFX10-DENORM-NEXT:    s_mov_b32 s20, s10
-; GFX10-DENORM-NEXT:    s_mov_b32 s21, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s12, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s13, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s16, s4
+; GFX10-DENORM-NEXT:    s_mov_b32 s17, s5
+; GFX10-DENORM-NEXT:    s_mov_b32 s20, s6
+; GFX10-DENORM-NEXT:    s_mov_b32 s21, s7
 ; GFX10-DENORM-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
 ; GFX10-DENORM-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
 ; GFX10-DENORM-NEXT:    buffer_load_ushort v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT:    s_mov_b32 s0, s4
-; GFX10-DENORM-NEXT:    s_mov_b32 s1, s5
+; GFX10-DENORM-NEXT:    s_mov_b32 s8, s0
+; GFX10-DENORM-NEXT:    s_mov_b32 s9, s1
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, v0, v1
-; GFX10-DENORM-NEXT:    buffer_store_short v2, off, s[0:3], 0
+; GFX10-DENORM-NEXT:    buffer_store_short v2, off, s[8:11], 0
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmuladd_f16:
@@ -722,60 +722,60 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ;
 ; GFX10-FLUSH-LABEL: fmuladd_v2f16:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX10-FLUSH-NEXT:    s_mov_b32 s2, -1
-; GFX10-FLUSH-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s2
-; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s3
-; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s2
-; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s3
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-FLUSH-NEXT:    s_mov_b32 s10, -1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s11
 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s6
-; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s7
-; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s8
-; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s9
+; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s4
+; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s5
 ; GFX10-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; GFX10-FLUSH-NEXT:    buffer_load_dword v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s10
-; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s11
-; GFX10-FLUSH-NEXT:    s_mov_b32 s10, s2
-; GFX10-FLUSH-NEXT:    s_mov_b32 s11, s3
-; GFX10-FLUSH-NEXT:    s_mov_b32 s0, s4
-; GFX10-FLUSH-NEXT:    buffer_load_dword v2, off, s[8:11], 0
-; GFX10-FLUSH-NEXT:    s_mov_b32 s1, s5
+; GFX10-FLUSH-NEXT:    s_mov_b32 s4, s6
+; GFX10-FLUSH-NEXT:    s_mov_b32 s5, s7
+; GFX10-FLUSH-NEXT:    s_mov_b32 s6, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s7, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s0
+; GFX10-FLUSH-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s1
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-FLUSH-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX10-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-FLUSH-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX10-DENORM-LABEL: fmuladd_v2f16:
 ; GFX10-DENORM:       ; %bb.0:
-; GFX10-DENORM-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX10-DENORM-NEXT:    s_mov_b32 s2, -1
-; GFX10-DENORM-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-DENORM-NEXT:    s_mov_b32 s14, s2
-; GFX10-DENORM-NEXT:    s_mov_b32 s15, s3
-; GFX10-DENORM-NEXT:    s_mov_b32 s18, s2
-; GFX10-DENORM-NEXT:    s_mov_b32 s19, s3
-; GFX10-DENORM-NEXT:    s_mov_b32 s22, s2
-; GFX10-DENORM-NEXT:    s_mov_b32 s23, s3
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX10-DENORM-NEXT:    s_mov_b32 s10, -1
+; GFX10-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX10-DENORM-NEXT:    s_mov_b32 s14, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s15, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s18, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s19, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s22, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s23, s11
 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT:    s_mov_b32 s12, s6
-; GFX10-DENORM-NEXT:    s_mov_b32 s13, s7
-; GFX10-DENORM-NEXT:    s_mov_b32 s16, s8
-; GFX10-DENORM-NEXT:    s_mov_b32 s17, s9
-; GFX10-DENORM-NEXT:    s_mov_b32 s20, s10
-; GFX10-DENORM-NEXT:    s_mov_b32 s21, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s12, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s13, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s16, s4
+; GFX10-DENORM-NEXT:    s_mov_b32 s17, s5
+; GFX10-DENORM-NEXT:    s_mov_b32 s20, s6
+; GFX10-DENORM-NEXT:    s_mov_b32 s21, s7
 ; GFX10-DENORM-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; GFX10-DENORM-NEXT:    buffer_load_dword v1, off, s[16:19], 0
 ; GFX10-DENORM-NEXT:    buffer_load_dword v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT:    s_mov_b32 s0, s4
-; GFX10-DENORM-NEXT:    s_mov_b32 s1, s5
+; GFX10-DENORM-NEXT:    s_mov_b32 s8, s0
+; GFX10-DENORM-NEXT:    s_mov_b32 s9, s1
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-DENORM-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-DENORM-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: fmuladd_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index e05f3f1e65ff3..a807885e0d853 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -139,12 +139,12 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
 ;
 ; GFX9CHECK-LABEL: sgpr_isnan_f64:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9CHECK-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[0:1], s[6:7], 3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9CHECK-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[2:3], s[2:3], 3
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[2:3]
+; GFX9CHECK-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9CHECK-NEXT:    s_endpgm
 ;
 ; GFX10CHECK-LABEL: sgpr_isnan_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 50c52037dc4d3..7ca04cc235605 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -314,25 +314,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; GFX900-SDAG-LABEL: s_log2_v2f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX900-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s7, v4
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s6, v1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v4, s3, v4
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX900-SDAG-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
 ; GFX900-SDAG-NEXT:    s_endpgm
 ;
 ; GFX900-GISEL-LABEL: s_log2_v2f32:
@@ -630,10 +630,10 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-LABEL: s_log2_v3f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX900-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
@@ -654,7 +654,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
 ; GFX1100-SDAG-LABEL: s_log2_v3f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 2c9ce001b8c4d..c7913f638798a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -194,40 +194,40 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
 ;
 ; GFX9-LABEL: maxnum_f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: maxnum_f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_max_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: maxnum_f16_imm_a:
@@ -302,40 +302,40 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
 ;
 ; GFX9-LABEL: maxnum_f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: maxnum_f16_imm_b:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_max_f16_e32 v0, 4.0, v0
-; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: maxnum_f16_imm_b:
@@ -524,29 +524,29 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
 ;
 ; GFX9-LABEL: maxnum_v2f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s0, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: maxnum_v2f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
+; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: maxnum_v2f16_imm_a:
@@ -614,29 +614,29 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
 ;
 ; GFX9-LABEL: maxnum_v2f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s0, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: maxnum_v2f16_imm_b:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
+; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: maxnum_v2f16_imm_b:
@@ -1007,36 +1007,36 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
 ;
 ; GFX9-LABEL: fmax_v4f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
 ; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
-; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
+; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
 ; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
 ; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: fmax_v4f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
-; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
+; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
+; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
 ; GFX10-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmax_v4f16_imm_a:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 59508e049a3a9..0a004fd7701cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -221,40 +221,40 @@ define amdgpu_kernel void @minnum_f16_imm_a(
 ;
 ; GFX9-LABEL: minnum_f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: minnum_f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_min_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: minnum_f16_imm_a:
@@ -328,40 +328,40 @@ define amdgpu_kernel void @minnum_f16_imm_b(
 ;
 ; GFX9-LABEL: minnum_f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: minnum_f16_imm_b:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_min_f16_e32 v0, 4.0, v0
-; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: minnum_f16_imm_b:
@@ -583,29 +583,29 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
 ;
 ; GFX9-LABEL: minnum_v2f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s0, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: minnum_v2f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
+; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: minnum_v2f16_imm_a:
@@ -672,29 +672,29 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
 ;
 ; GFX9-LABEL: minnum_v2f16_imm_b:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s0, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: minnum_v2f16_imm_b:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s0, s0
+; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: minnum_v2f16_imm_b:
@@ -1062,36 +1062,36 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
 ;
 ; GFX9-LABEL: fmin_v4f16_imm_a:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
 ; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
-; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
+; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
 ; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
 ; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: fmin_v4f16_imm_a:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s1, s1
-; GFX10-NEXT:    v_pk_max_f16 v2, s0, s0
+; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
+; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_min_f16 v1, 0x44004200, v0
 ; GFX10-NEXT:    v_pk_min_f16 v0, 0x40004800, v2
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fmin_v4f16_imm_a:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 2d3b18dcb121c..53ea253035655 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -365,57 +365,57 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ;
 ; GFX9-LABEL: umulo_i64_s:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s3, s4, s7
-; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s6
-; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s7
-; GFX9-NEXT:    s_add_u32 s9, s8, s3
-; GFX9-NEXT:    s_mul_i32 s2, s5, s6
-; GFX9-NEXT:    s_addc_u32 s1, 0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s0, s5, s6
-; GFX9-NEXT:    s_add_u32 s9, s9, s2
-; GFX9-NEXT:    s_mul_hi_u32 s10, s5, s7
-; GFX9-NEXT:    s_addc_u32 s0, s1, s0
-; GFX9-NEXT:    s_addc_u32 s1, s10, 0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s7
-; GFX9-NEXT:    s_add_u32 s0, s0, s5
-; GFX9-NEXT:    s_addc_u32 s1, 0, s1
-; GFX9-NEXT:    s_add_i32 s3, s8, s3
-; GFX9-NEXT:    s_add_i32 s3, s3, s2
-; GFX9-NEXT:    s_mul_i32 s2, s4, s6
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_cselect_b32 s0, 0, s3
-; GFX9-NEXT:    s_cselect_b32 s1, 0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_mul_i32 s7, s0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX9-NEXT:    s_add_u32 s9, s8, s7
+; GFX9-NEXT:    s_mul_i32 s6, s1, s2
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
+; GFX9-NEXT:    s_add_u32 s9, s9, s6
+; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
+; GFX9-NEXT:    s_addc_u32 s4, s5, s4
+; GFX9-NEXT:    s_addc_u32 s5, s10, 0
+; GFX9-NEXT:    s_mul_i32 s1, s1, s3
+; GFX9-NEXT:    s_add_u32 s4, s4, s1
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_add_i32 s1, s8, s7
+; GFX9-NEXT:    s_add_i32 s1, s1, s6
+; GFX9-NEXT:    s_mul_i32 s0, s0, s2
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT:    s_cselect_b32 s1, 0, s1
+; GFX9-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: umulo_i64_s:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mul_i32 s3, s4, s7
-; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
-; GFX10-NEXT:    s_mul_hi_u32 s1, s4, s7
-; GFX10-NEXT:    s_mul_hi_u32 s0, s5, s6
-; GFX10-NEXT:    s_mul_i32 s2, s5, s6
-; GFX10-NEXT:    s_mul_hi_u32 s9, s5, s7
-; GFX10-NEXT:    s_mul_i32 s5, s5, s7
-; GFX10-NEXT:    s_add_u32 s7, s8, s3
-; GFX10-NEXT:    s_addc_u32 s1, 0, s1
-; GFX10-NEXT:    s_add_u32 s7, s7, s2
-; GFX10-NEXT:    s_addc_u32 s0, s1, s0
-; GFX10-NEXT:    s_addc_u32 s1, s9, 0
-; GFX10-NEXT:    s_add_u32 s0, s0, s5
-; GFX10-NEXT:    s_addc_u32 s1, 0, s1
-; GFX10-NEXT:    s_add_i32 s3, s8, s3
-; GFX10-NEXT:    s_mul_i32 s4, s4, s6
-; GFX10-NEXT:    s_add_i32 s3, s3, s2
-; GFX10-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX10-NEXT:    s_cselect_b32 s0, 0, s4
-; GFX10-NEXT:    s_cselect_b32 s1, 0, s3
+; GFX10-NEXT:    s_mul_i32 s7, s0, s3
+; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
+; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
+; GFX10-NEXT:    s_mul_i32 s6, s1, s2
+; GFX10-NEXT:    s_mul_hi_u32 s9, s1, s3
+; GFX10-NEXT:    s_mul_i32 s1, s1, s3
+; GFX10-NEXT:    s_add_u32 s3, s8, s7
+; GFX10-NEXT:    s_addc_u32 s5, 0, s5
+; GFX10-NEXT:    s_add_u32 s3, s3, s6
+; GFX10-NEXT:    s_addc_u32 s3, s5, s4
+; GFX10-NEXT:    s_addc_u32 s5, s9, 0
+; GFX10-NEXT:    s_add_u32 s4, s3, s1
+; GFX10-NEXT:    s_addc_u32 s5, 0, s5
+; GFX10-NEXT:    s_add_i32 s1, s8, s7
+; GFX10-NEXT:    s_mul_i32 s0, s0, s2
+; GFX10-NEXT:    s_add_i32 s1, s1, s6
+; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX10-NEXT:    s_cselect_b32 s1, 0, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
@@ -540,81 +540,81 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ;
 ; GFX9-LABEL: smulo_i64_s:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s3, s4, s7
-; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s6
-; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s7
-; GFX9-NEXT:    s_add_u32 s9, s8, s3
-; GFX9-NEXT:    s_mul_i32 s2, s5, s6
-; GFX9-NEXT:    s_addc_u32 s1, 0, s1
-; GFX9-NEXT:    s_mul_hi_u32 s0, s5, s6
-; GFX9-NEXT:    s_add_u32 s9, s9, s2
-; GFX9-NEXT:    s_mul_hi_i32 s10, s5, s7
-; GFX9-NEXT:    s_addc_u32 s0, s1, s0
-; GFX9-NEXT:    s_addc_u32 s1, s10, 0
-; GFX9-NEXT:    s_mul_i32 s9, s5, s7
-; GFX9-NEXT:    s_add_u32 s0, s0, s9
-; GFX9-NEXT:    s_addc_u32 s1, 0, s1
-; GFX9-NEXT:    s_sub_u32 s9, s0, s6
-; GFX9-NEXT:    s_subb_u32 s10, s1, 0
-; GFX9-NEXT:    s_cmp_lt_i32 s5, 0
-; GFX9-NEXT:    s_cselect_b32 s0, s9, s0
-; GFX9-NEXT:    s_cselect_b32 s1, s10, s1
-; GFX9-NEXT:    s_sub_u32 s5, s0, s4
-; GFX9-NEXT:    s_subb_u32 s9, s1, 0
-; GFX9-NEXT:    s_cmp_lt_i32 s7, 0
-; GFX9-NEXT:    s_cselect_b32 s1, s9, s1
-; GFX9-NEXT:    s_cselect_b32 s0, s5, s0
-; GFX9-NEXT:    s_add_i32 s3, s8, s3
-; GFX9-NEXT:    s_add_i32 s5, s3, s2
-; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_mul_i32 s4, s4, s6
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], s[2:3]
-; GFX9-NEXT:    s_cselect_b32 s0, 0, s5
-; GFX9-NEXT:    s_cselect_b32 s1, 0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_mul_i32 s7, s0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX9-NEXT:    s_add_u32 s9, s8, s7
+; GFX9-NEXT:    s_mul_i32 s6, s1, s2
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
+; GFX9-NEXT:    s_add_u32 s9, s9, s6
+; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
+; GFX9-NEXT:    s_addc_u32 s4, s5, s4
+; GFX9-NEXT:    s_addc_u32 s5, s10, 0
+; GFX9-NEXT:    s_mul_i32 s9, s1, s3
+; GFX9-NEXT:    s_add_u32 s4, s4, s9
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_sub_u32 s9, s4, s2
+; GFX9-NEXT:    s_subb_u32 s10, s5, 0
+; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX9-NEXT:    s_cselect_b32 s1, s10, s5
+; GFX9-NEXT:    s_sub_u32 s9, s4, s0
+; GFX9-NEXT:    s_subb_u32 s5, s1, 0
+; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s1
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX9-NEXT:    s_add_i32 s1, s8, s7
+; GFX9-NEXT:    s_add_i32 s1, s1, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX9-NEXT:    s_mov_b32 s7, s6
+; GFX9-NEXT:    s_mul_i32 s0, s0, s2
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
+; GFX9-NEXT:    s_cselect_b32 s1, 0, s1
+; GFX9-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: smulo_i64_s:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mul_i32 s3, s4, s7
-; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
-; GFX10-NEXT:    s_mul_hi_u32 s1, s4, s7
-; GFX10-NEXT:    s_mul_i32 s2, s5, s6
-; GFX10-NEXT:    s_add_u32 s11, s8, s3
-; GFX10-NEXT:    s_mul_hi_u32 s0, s5, s6
-; GFX10-NEXT:    s_addc_u32 s1, 0, s1
-; GFX10-NEXT:    s_mul_hi_i32 s9, s5, s7
-; GFX10-NEXT:    s_add_u32 s11, s11, s2
-; GFX10-NEXT:    s_mul_i32 s10, s5, s7
-; GFX10-NEXT:    s_addc_u32 s0, s1, s0
-; GFX10-NEXT:    s_addc_u32 s1, s9, 0
-; GFX10-NEXT:    s_add_u32 s0, s0, s10
-; GFX10-NEXT:    s_addc_u32 s1, 0, s1
-; GFX10-NEXT:    s_sub_u32 s9, s0, s6
-; GFX10-NEXT:    s_subb_u32 s10, s1, 0
-; GFX10-NEXT:    s_cmp_lt_i32 s5, 0
-; GFX10-NEXT:    s_cselect_b32 s0, s9, s0
-; GFX10-NEXT:    s_cselect_b32 s1, s10, s1
-; GFX10-NEXT:    s_sub_u32 s5, s0, s4
-; GFX10-NEXT:    s_subb_u32 s9, s1, 0
-; GFX10-NEXT:    s_cmp_lt_i32 s7, 0
-; GFX10-NEXT:    s_mul_i32 s4, s4, s6
-; GFX10-NEXT:    s_cselect_b32 s1, s9, s1
-; GFX10-NEXT:    s_cselect_b32 s0, s5, s0
-; GFX10-NEXT:    s_add_i32 s3, s8, s3
-; GFX10-NEXT:    s_add_i32 s5, s3, s2
-; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_cmp_lg_u64 s[0:1], s[2:3]
-; GFX10-NEXT:    s_cselect_b32 s0, 0, s4
-; GFX10-NEXT:    s_cselect_b32 s1, 0, s5
+; GFX10-NEXT:    s_mul_i32 s7, s0, s3
+; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
+; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX10-NEXT:    s_mul_i32 s6, s1, s2
+; GFX10-NEXT:    s_add_u32 s11, s8, s7
+; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
+; GFX10-NEXT:    s_addc_u32 s5, 0, s5
+; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
+; GFX10-NEXT:    s_add_u32 s11, s11, s6
+; GFX10-NEXT:    s_mul_i32 s10, s1, s3
+; GFX10-NEXT:    s_addc_u32 s4, s5, s4
+; GFX10-NEXT:    s_addc_u32 s5, s9, 0
+; GFX10-NEXT:    s_add_u32 s4, s4, s10
+; GFX10-NEXT:    s_addc_u32 s5, 0, s5
+; GFX10-NEXT:    s_sub_u32 s9, s4, s2
+; GFX10-NEXT:    s_subb_u32 s10, s5, 0
+; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
+; GFX10-NEXT:    s_cselect_b32 s1, s9, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s10, s5
+; GFX10-NEXT:    s_sub_u32 s9, s1, s0
+; GFX10-NEXT:    s_subb_u32 s5, s4, 0
+; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
+; GFX10-NEXT:    s_mul_i32 s0, s0, s2
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s9, s1
+; GFX10-NEXT:    s_add_i32 s1, s8, s7
+; GFX10-NEXT:    s_add_i32 s1, s1, s6
+; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX10-NEXT:    s_mov_b32 s7, s6
+; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
+; GFX10-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX10-NEXT:    s_cselect_b32 s1, 0, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index bf7dbcde62fdd..47dd0263d020e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -28,41 +28,23 @@ define amdgpu_kernel void @rint_f16(
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
-; VI-LABEL: rint_f16:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
-; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_rndne_f16_e32 v0, v0
-; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: rint_f16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: rint_f16:
+; GFX89:       ; %bb.0: ; %entry
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_mov_b32 s10, s6
+; GFX89-NEXT:    s_mov_b32 s11, s7
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s8, s2
+; GFX89-NEXT:    s_mov_b32 s9, s3
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rint_f16:
 ; GFX11:       ; %bb.0: ; %entry
@@ -149,22 +131,22 @@ define amdgpu_kernel void @rint_v2f16(
 ;
 ; GFX9-LABEL: rint_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
 ; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rint_v2f16:
@@ -198,5 +180,3 @@ entry:
   store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX89: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index e7b17c30cf753..d5b4f879bf8a0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11 %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
 
@@ -24,41 +24,23 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX8-LABEL: round_f32:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[2:3], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX8-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX8-NEXT:    s_brev_b32 s4, -2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: round_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s6, s[2:3], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX9-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: round_f32:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX89-NEXT:    s_mov_b32 s3, 0xf000
+; GFX89-NEXT:    s_mov_b32 s2, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX89-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
+; GFX89-NEXT:    s_brev_b32 s4, -2
+; GFX89-NEXT:    v_mov_b32_e32 v2, s6
+; GFX89-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX89-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_f32:
 ; GFX11:       ; %bb.0:
@@ -133,57 +115,31 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX8-LABEL: round_v2f32:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8-NEXT:    s_brev_b32 s8, -2
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s3
-; GFX8-NEXT:    v_sub_f32_e32 v1, s3, v0
-; GFX8-NEXT:    s_mov_b32 s4, s0
-; GFX8-NEXT:    s_mov_b32 s5, s1
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s2
-; GFX8-NEXT:    v_sub_f32_e32 v2, s2, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: round_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_brev_b32 s8, -2
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_bfi_b32 v1, s8, v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX9-NEXT:    v_sub_f32_e32 v2, s6, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_bfi_b32 v2, s8, v2, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: round_v2f32:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX89-NEXT:    s_brev_b32 s8, -2
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s3
+; GFX89-NEXT:    v_sub_f32_e32 v1, s3, v0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v2, s3
+; GFX89-NEXT:    v_bfi_b32 v1, s8, v1, v2
+; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s2
+; GFX89-NEXT:    v_sub_f32_e32 v2, s2, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v3, s2
+; GFX89-NEXT:    v_bfi_b32 v2, s8, v2, v3
+; GFX89-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v2f32:
 ; GFX11:       ; %bb.0:
@@ -279,83 +235,44 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX8-LABEL: round_v4f32:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8-NEXT:    s_brev_b32 s10, -2
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX8-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
-; GFX8-NEXT:    v_bfi_b32 v1, s10, v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v3, v0, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX8-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_bfi_b32 v1, s10, v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v2, v0, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s5
-; GFX8-NEXT:    v_sub_f32_e32 v1, s5, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    v_bfi_b32 v1, s10, v1, v4
-; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s4
-; GFX8-NEXT:    v_sub_f32_e32 v4, s4, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NEXT:    v_bfi_b32 v4, s10, v4, v5
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: round_v4f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    s_brev_b32 s10, -2
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_bfi_b32 v1, s10, v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX9-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_bfi_b32 v1, s10, v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s5
-; GFX9-NEXT:    v_sub_f32_e32 v1, s5, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    v_bfi_b32 v1, s10, v1, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s4
-; GFX9-NEXT:    v_sub_f32_e32 v4, s4, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-NEXT:    v_bfi_b32 v4, s10, v4, v5
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: round_v4f32:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX89-NEXT:    s_brev_b32 s10, -2
+; GFX89-NEXT:    s_mov_b32 s3, 0xf000
+; GFX89-NEXT:    s_mov_b32 s2, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX89-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX89-NEXT:    v_mov_b32_e32 v2, s7
+; GFX89-NEXT:    v_bfi_b32 v1, s10, v1, v2
+; GFX89-NEXT:    v_add_f32_e32 v3, v0, v1
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX89-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
+; GFX89-NEXT:    v_mov_b32_e32 v2, s6
+; GFX89-NEXT:    v_bfi_b32 v1, s10, v1, v2
+; GFX89-NEXT:    v_add_f32_e32 v2, v0, v1
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s5
+; GFX89-NEXT:    v_sub_f32_e32 v1, s5, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX89-NEXT:    v_mov_b32_e32 v4, s5
+; GFX89-NEXT:    v_bfi_b32 v1, s10, v1, v4
+; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s4
+; GFX89-NEXT:    v_sub_f32_e32 v4, s4, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX89-NEXT:    v_mov_b32_e32 v5, s4
+; GFX89-NEXT:    v_bfi_b32 v4, s10, v4, v5
+; GFX89-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v4f32:
 ; GFX11:       ; %bb.0:
@@ -504,141 +421,73 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX8-LABEL: round_v8f32:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x44
-; GFX8-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x24
-; GFX8-NEXT:    s_brev_b32 s2, -2
-; GFX8-NEXT:    s_mov_b32 s15, 0xf000
-; GFX8-NEXT:    s_mov_b32 s14, -1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX8-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
-; GFX8-NEXT:    v_bfi_b32 v1, s2, v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v3, v0, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX8-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_bfi_b32 v1, s2, v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v2, v0, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s5
-; GFX8-NEXT:    v_sub_f32_e32 v1, s5, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    v_bfi_b32 v1, s2, v1, v4
-; GFX8-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v0, s4
-; GFX8-NEXT:    v_sub_f32_e32 v4, s4, v0
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NEXT:    v_bfi_b32 v4, s2, v4, v5
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT:    v_trunc_f32_e32 v4, s11
-; GFX8-NEXT:    v_sub_f32_e32 v5, s11, v4
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v6, s11
-; GFX8-NEXT:    v_bfi_b32 v5, s2, v5, v6
-; GFX8-NEXT:    v_add_f32_e32 v7, v4, v5
-; GFX8-NEXT:    v_trunc_f32_e32 v4, s10
-; GFX8-NEXT:    v_sub_f32_e32 v5, s10, v4
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v6, s10
-; GFX8-NEXT:    v_bfi_b32 v5, s2, v5, v6
-; GFX8-NEXT:    v_add_f32_e32 v6, v4, v5
-; GFX8-NEXT:    v_trunc_f32_e32 v4, s9
-; GFX8-NEXT:    v_sub_f32_e32 v5, s9, v4
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v8, s9
-; GFX8-NEXT:    v_bfi_b32 v5, s2, v5, v8
-; GFX8-NEXT:    v_add_f32_e32 v5, v4, v5
-; GFX8-NEXT:    v_trunc_f32_e32 v4, s8
-; GFX8-NEXT:    v_sub_f32_e32 v8, s8, v4
-; GFX8-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v9, s8
-; GFX8-NEXT:    v_bfi_b32 v8, s2, v8, v9
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v8
-; GFX8-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
-; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: round_v8f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x44
-; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x24
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9-NEXT:    s_mov_b32 s14, -1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s7
-; GFX9-NEXT:    v_sub_f32_e32 v1, s7, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s6
-; GFX9-NEXT:    v_sub_f32_e32 v1, s6, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s5
-; GFX9-NEXT:    v_sub_f32_e32 v1, s5, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v0, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v0, s4
-; GFX9-NEXT:    v_sub_f32_e32 v4, s4, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-NEXT:    v_bfi_b32 v4, s2, v4, v5
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v4, s11
-; GFX9-NEXT:    v_sub_f32_e32 v5, s11, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
-; GFX9-NEXT:    v_bfi_b32 v5, s2, v5, v6
-; GFX9-NEXT:    v_add_f32_e32 v7, v4, v5
-; GFX9-NEXT:    v_trunc_f32_e32 v4, s10
-; GFX9-NEXT:    v_sub_f32_e32 v5, s10, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-NEXT:    v_bfi_b32 v5, s2, v5, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, v4, v5
-; GFX9-NEXT:    v_trunc_f32_e32 v4, s9
-; GFX9-NEXT:    v_sub_f32_e32 v5, s9, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-NEXT:    v_bfi_b32 v5, s2, v5, v8
-; GFX9-NEXT:    v_add_f32_e32 v5, v4, v5
-; GFX9-NEXT:    v_trunc_f32_e32 v4, s8
-; GFX9-NEXT:    v_sub_f32_e32 v8, s8, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v9, s8
-; GFX9-NEXT:    v_bfi_b32 v8, s2, v8, v9
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v8
-; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: round_v8f32:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x44
+; GFX89-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x24
+; GFX89-NEXT:    s_brev_b32 s2, -2
+; GFX89-NEXT:    s_mov_b32 s15, 0xf000
+; GFX89-NEXT:    s_mov_b32 s14, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s7
+; GFX89-NEXT:    v_sub_f32_e32 v1, s7, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v2, s7
+; GFX89-NEXT:    v_bfi_b32 v1, s2, v1, v2
+; GFX89-NEXT:    v_add_f32_e32 v3, v0, v1
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s6
+; GFX89-NEXT:    v_sub_f32_e32 v1, s6, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v2, s6
+; GFX89-NEXT:    v_bfi_b32 v1, s2, v1, v2
+; GFX89-NEXT:    v_add_f32_e32 v2, v0, v1
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s5
+; GFX89-NEXT:    v_sub_f32_e32 v1, s5, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v4, s5
+; GFX89-NEXT:    v_bfi_b32 v1, s2, v1, v4
+; GFX89-NEXT:    v_add_f32_e32 v1, v0, v1
+; GFX89-NEXT:    v_trunc_f32_e32 v0, s4
+; GFX89-NEXT:    v_sub_f32_e32 v4, s4, v0
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v5, s4
+; GFX89-NEXT:    v_bfi_b32 v4, s2, v4, v5
+; GFX89-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX89-NEXT:    v_trunc_f32_e32 v4, s11
+; GFX89-NEXT:    v_sub_f32_e32 v5, s11, v4
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v6, s11
+; GFX89-NEXT:    v_bfi_b32 v5, s2, v5, v6
+; GFX89-NEXT:    v_add_f32_e32 v7, v4, v5
+; GFX89-NEXT:    v_trunc_f32_e32 v4, s10
+; GFX89-NEXT:    v_sub_f32_e32 v5, s10, v4
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v6, s10
+; GFX89-NEXT:    v_bfi_b32 v5, s2, v5, v6
+; GFX89-NEXT:    v_add_f32_e32 v6, v4, v5
+; GFX89-NEXT:    v_trunc_f32_e32 v4, s9
+; GFX89-NEXT:    v_sub_f32_e32 v5, s9, v4
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v8, s9
+; GFX89-NEXT:    v_bfi_b32 v5, s2, v5, v8
+; GFX89-NEXT:    v_add_f32_e32 v5, v4, v5
+; GFX89-NEXT:    v_trunc_f32_e32 v4, s8
+; GFX89-NEXT:    v_sub_f32_e32 v8, s8, v4
+; GFX89-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
+; GFX89-NEXT:    v_mov_b32_e32 v9, s8
+; GFX89-NEXT:    v_bfi_b32 v8, s2, v8, v9
+; GFX89-NEXT:    v_add_f32_e32 v4, v4, v8
+; GFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_v8f32:
 ; GFX11:       ; %bb.0:
@@ -797,43 +646,24 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
-; GFX8-LABEL: round_f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0x3c00
-; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX8-NEXT:    s_mov_b32 s3, 0xf000
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_trunc_f16_e32 v1, s4
-; GFX8-NEXT:    v_sub_f16_e32 v2, s4, v1
-; GFX8-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_bfi_b32 v0, s5, v0, v2
-; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-LABEL: round_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
-; GFX9-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f16_e32 v1, s4
-; GFX9-NEXT:    v_sub_f16_e32 v2, s4, v1
-; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_bfi_b32 v0, s5, v0, v2
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
-; GFX9-NEXT:    s_endpgm
+; GFX89-LABEL: round_f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX89-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX89-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX89-NEXT:    s_mov_b32 s3, 0xf000
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    v_trunc_f16_e32 v1, s4
+; GFX89-NEXT:    v_sub_f16_e32 v2, s4, v1
+; GFX89-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX89-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX89-NEXT:    v_mov_b32_e32 v2, s4
+; GFX89-NEXT:    v_bfi_b32 v0, s5, v0, v2
+; GFX89-NEXT:    s_mov_b32 s2, -1
+; GFX89-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: round_f16:
 ; GFX11:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 3ae0cf65eb00f..a70f4d8d90065 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -46,26 +46,26 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX9-LABEL: sin_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
 ; GFX9-NEXT:    v_sin_f16_e32 v1, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sin_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
 ; GFX10-NEXT:    v_sin_f16_e32 v1, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: sin_f16:
@@ -142,34 +142,34 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX9-LABEL: sin_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_sin_f16_e32 v2, v3
 ; GFX9-NEXT:    v_sin_f16_e32 v1, v1
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sin_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
 ; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_sin_f16_e32 v2, v3
 ; GFX10-NEXT:    v_sin_f16_e32 v1, v1
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: sin_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll
index 57028a0f9b14f..66c68f7cc731e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll
@@ -17,10 +17,10 @@ define void @loads(ptr addrspace(8) %buf) {
 ; CHECK-NEXT:    [[VOLATILE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648)
 ; CHECK-NEXT:    [[VOLATILE_NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483646), !nontemporal [[META0]]
 ; CHECK-NEXT:    fence syncscope("wavefront") release
-; CHECK-NEXT:    [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647)
+; CHECK-NEXT:    [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647)
 ; CHECK-NEXT:    fence syncscope("wavefront") acquire
-; CHECK-NEXT:    [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
-; CHECK-NEXT:    [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
+; CHECK-NEXT:    [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
+; CHECK-NEXT:    [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1)
 ; CHECK-NEXT:    fence acquire
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index 40cfa4e7e4dfc..e9a1b38eee157 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -93,7 +93,7 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
 define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
 ; GCN-LABEL: add_u64_ss:
 ; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s5, s7
+; GCN: s_addc_u32 s1, s1, s3
   %add = add i64 %v, %a
   store i64 %add, ptr undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a4bde5c9d8215..e8ac1b2887c36 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,12 +8,12 @@
 define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s7, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s3, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_lshr_v2i16:
@@ -54,11 +54,11 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ;
 ; GFX10-LABEL: s_lshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s7, s6
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_lshr_v2i16:
@@ -79,13 +79,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_lshr_v2i16:
@@ -131,13 +131,13 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX10-LABEL: v_lshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
-; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_lshr_v2i16:
@@ -363,13 +363,13 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: lshr_imm_v_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_imm_v_v2i16:
@@ -414,13 +414,13 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX10-LABEL: lshr_imm_v_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: lshr_imm_v_v2i16:
@@ -450,13 +450,13 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
 define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: lshr_v_imm_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_v_imm_v2i16:
@@ -497,13 +497,13 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX10-LABEL: lshr_v_imm_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: lshr_v_imm_v2i16:
@@ -533,14 +533,14 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
 define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_lshr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_lshr_v4i16:
@@ -596,14 +596,14 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX10-LABEL: v_lshr_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_lshr_v4i16:
@@ -636,14 +636,14 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: lshr_v_imm_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lshr_v_imm_v4i16:
@@ -689,14 +689,14 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX10-LABEL: lshr_v_imm_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: lshr_v_imm_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index a3e7bf2caf772..3032b1028dc2d 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -35,34 +35,34 @@ define amdgpu_kernel void @mad_u16(
 ;
 ; GFX9-LABEL: mad_u16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[8:9] glc
+; GFX9-NEXT:    global_load_ushort v2, v0, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v3, v0, s[10:11] glc
+; GFX9-NEXT:    global_load_ushort v3, v0, s[6:7] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: mad_u16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_ushort v2, v0, s[8:9] glc dlc
+; GFX10-NEXT:    global_load_ushort v2, v0, s[4:5] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_ushort v3, v0, s[10:11] glc dlc
+; GFX10-NEXT:    global_load_ushort v3, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mad_u16 v1, v1, v2, v3
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: mad_u16:
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index f2815915b8425..b8b4d4440d580 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -225,40 +225,40 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX9-LABEL: madak_2_use_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc
+; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX9-NEXT:    v_mac_f32_e32 v4, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v4, s[6:7] offset:4
+; GFX9-NEXT:    global_store_dword v0, v4, s[2:3] offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_2_use_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
+; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v3, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
@@ -289,41 +289,41 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ;
 ; GFX940-FMA-LABEL: madak_2_use_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v4, 0x41200000
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7] sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1
+; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX940-FMA-NEXT:    v_fmac_f32_e32 v4, v1, v3
-; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_2_use_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[6:7] offset:8 glc dlc
+; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[6:7] offset:4
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
@@ -408,24 +408,24 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX9-LABEL: madak_m_inline_imm_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
@@ -446,25 +446,25 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ;
 ; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-FMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
@@ -961,23 +961,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX9-LABEL: s_s_madak_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-NEXT:    v_mac_f32_e32 v1, s6, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mac_f32_e32 v1, s2, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-MAD-LABEL: s_s_madak_f32:
 ; GFX10-MAD:       ; %bb.0:
-; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s7
-; GFX10-MAD-NEXT:    v_madak_f32 v0, s6, v0, 0x41200000
-; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s3
+; GFX10-MAD-NEXT:    v_madak_f32 v0, s2, v0, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
 ; GFX11-MAD-LABEL: s_s_madak_f32:
@@ -994,23 +994,23 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ;
 ; GFX940-FMA-LABEL: s_s_madak_f32:
 ; GFX940-FMA:       ; %bb.0:
-; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s7
-; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s6, v2
-; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s2, v2
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-FMA-NEXT:    s_endpgm
 ;
 ; GFX10-FMA-LABEL: s_s_madak_f32:
 ; GFX10-FMA:       ; %bb.0:
-; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s7
-; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s6, v0, 0x41200000
-; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: s_s_madak_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 1de9206801e2a..3a065d518f0a9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -5,21 +5,21 @@
 define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[4:5]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[4:5] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[4:5] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[4:5] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-SCRATCH-LABEL: vector_clause:
diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
index 28d30a70e50c1..b08da2e1848ff 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
@@ -1,5 +1,4 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX10
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX11
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX12
 
@@ -7,23 +6,11 @@
 name: merge_s_load_x1_x1
 body: |
   bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x1_x1
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1
-    ;
-    ; GFX11-LABEL: name: merge_s_load_x1_x1
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x1_x1
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+    ; CHECK-LABEL: name: merge_s_load_x1_x1
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
     %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
@@ -59,13 +46,6 @@ body: |
 name: merge_s_load_x1_x1_x1
 body: |
   bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x1_x1_x1
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %4.sub0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %4.sub1
-    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
-    ;
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
@@ -90,16 +70,6 @@ body: |
 name: merge_s_load_x1_x1_x1_x1
 body: |
   bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x1_x1_x1_x1
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1
-    ;
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
@@ -130,24 +100,6 @@ body: |
 name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
 body: |
   bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY8]].sub0
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0
-    ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1
-    ;
     ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
@@ -198,11 +150,6 @@ body: |
 name: merge_s_load_x2_x1
 body: |
   bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x2_x1
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
-    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
-    ;
     ; GFX11-LABEL: name: merge_s_load_x2_x1
     ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
     ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
@@ -222,23 +169,11 @@ body: |
 name: merge_s_load_x2_x2
 body: |
   bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x2_x2
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3
-    ;
-    ; GFX11-LABEL: name: merge_s_load_x2_x2
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x2_x2
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; CHECK-LABEL: name: merge_s_load_x2_x2
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
     %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
@@ -248,35 +183,15 @@ body: |
 name: merge_s_load_x2_x2_x2_x2
 body: |
   bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x2_x2_x2_x2
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
-    ;
-    ; GFX11-LABEL: name: merge_s_load_x2_x2_x2_x2
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x2_x2_x2_x2
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
-    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
-    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    ; CHECK-LABEL: name: merge_s_load_x2_x2_x2_x2
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
     %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
@@ -302,181 +217,12 @@ body: |
 name: merge_s_load_x4_x4
 body: |
   bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x4_x4
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
-    ;
-    ; GFX11-LABEL: name: merge_s_load_x4_x4
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x4_x4
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; CHECK-LABEL: name: merge_s_load_x4_x4
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
     %0:sgpr_64 = IMPLICIT_DEF
     %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128))
     %2:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128))
 ...
-
-# The constrained multi-dword scalar load merge tests.
----
-name: merge_s_load_x1_x2ec
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x1_x2ec
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
-    ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s64))
-    %0:sgpr_64 = IMPLICIT_DEF
-    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
-    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64))
-...
-
----
-name: merge_s_load_x1_x3ec
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x1_x3ec
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32))
-    ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16)
-    %0:sgpr_64 = IMPLICIT_DEF
-    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
-    early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96))
-...
-
----
-name: merge_s_load_x2ec_x1
-body: |
-  bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x2ec_x1
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
-    ; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
-    ;
-    ; GFX11-LABEL: name: merge_s_load_x2ec_x1
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
-    ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x2ec_x1
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
-    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32))
-...
-
----
-name: merge_s_load_x2ec_x2ec
-body: |
-  bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x2ec_x2ec
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3
-    ;
-    ; GFX11-LABEL: name: merge_s_load_x2ec_x2ec
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x2ec_x2ec
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
-    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
-...
-
----
-name: merge_s_load_x2ec_x2ec_x2ec_x2ec
-body: |
-  bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
-    ;
-    ; GFX11-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1
-    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3
-    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64))
-    early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64))
-    early-clobber %3:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64))
-    early-clobber %4:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64))
-...
-
----
-name: merge_s_load_x3ec_x1
-body: |
-  bb.0:
-    ; CHECK-LABEL: name: merge_s_load_x3ec_x1
-    ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128))
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96))
-    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32))
-...
-
----
-name: merge_s_load_x4ec_x4ec
-body: |
-  bb.0:
-    ; GFX10-LABEL: name: merge_s_load_x4ec_x4ec
-    ; GFX10: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7
-    ;
-    ; GFX11-LABEL: name: merge_s_load_x4ec_x4ec
-    ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ;
-    ; GFX12-LABEL: name: merge_s_load_x4ec_x4ec
-    ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
-    ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16)
-    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    %0:sgpr_64 = IMPLICIT_DEF
-    early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128))
-    early-clobber %2:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128))
-...
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 686797f290b97..a77892c8f5fc7 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -2513,38 +2513,38 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ;
 ; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[12:13], 0x0
-; GFX9-NEXT:    s_load_dword s3, s[14:15], 0x0
+; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s9, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_lt_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v0, v2, s[8:9]
-; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
+; GFX9-NEXT:    s_cmp_lt_u32 s8, s9
+; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dword s0, s[12:13], 0x0
-; GFX10-NEXT:    s_load_dword s1, s[14:15], 0x0
+; GFX10-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX10-NEXT:    s_load_dword s9, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-NEXT:    s_and_b32 s2, s2, exec_lo
-; GFX10-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    global_store_dword v1, v2, s[8:9]
-; GFX10-NEXT:    global_store_byte v1, v0, s[10:11]
+; GFX10-NEXT:    s_cmp_lt_u32 s8, s9
+; GFX10-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s4, s8, s9
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
+; GFX10-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
@@ -2665,33 +2665,33 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ;
 ; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    global_store_short v0, v1, s[8:9]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
+; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ushort v1, v0, s[12:13]
-; GFX10-NEXT:    global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[4:5]
+; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_short v0, v1, s[8:9]
-; GFX10-NEXT:    global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v2, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index b696f097d05b7..ece7e28c763fb 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
 define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: ctlz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: ctlz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7]
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7] offset:2
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:4
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(5)
@@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone
@@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64_poison:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX9-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64_poison:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad
 define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GFX9-LABEL: cttz_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX9-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:6
-; GFX9-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:7
-; GFX9-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX9-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX9-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX9-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX9-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cttz_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x7
-; GFX10-NEXT:    global_load_ubyte v0, v1, s[6:7] offset:5
-; GFX10-NEXT:    global_load_ubyte v2, v1, s[6:7] offset:7
-; GFX10-NEXT:    global_load_ubyte v3, v1, s[6:7] offset:6
-; GFX10-NEXT:    global_load_ubyte v4, v1, s[6:7] offset:1
-; GFX10-NEXT:    global_load_ubyte v5, v1, s[6:7] offset:3
-; GFX10-NEXT:    global_load_ubyte v6, v1, s[6:7] offset:4
-; GFX10-NEXT:    global_load_ubyte v7, v1, s[6:7]
-; GFX10-NEXT:    global_load_ubyte v8, v1, s[6:7] offset:2
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(7)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
@@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0
-; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %arrayidx, align 1
   %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 9b44b58c4a01e..0889f8ef6316e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -50,40 +50,40 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_mul_v2i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: test_mul_v2i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_mul_v2i32:
@@ -201,47 +201,47 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_mul_v4i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v4
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_v4i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX10-NEXT:    s_mov_b32 s0, s4
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v5
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
-; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_v4i32:
@@ -857,41 +857,41 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: v_mul64_sext_c:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_movk_i32 s0, 0x50
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_movk_i32 s2, 0x50
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul64_sext_c:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_hi_i32 v1, 0x50, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul64_sext_c:
@@ -1004,41 +1004,41 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: v_mul64_zext_c:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_movk_i32 s0, 0x50
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_movk_i32 s2, 0x50
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, s0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul64_zext_c:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_hi_u32 v1, 0x50, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul64_zext_c:
@@ -1149,40 +1149,40 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_mul64_sext_inline_imm:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_hi_i32 v1, v0, 9
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 9
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul64_sext_inline_imm:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_hi_i32 v1, v0, 9
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 9
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul64_sext_inline_imm:
@@ -1396,38 +1396,38 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX9-LABEL: v_mul_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
 ; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i32:
@@ -1662,43 +1662,43 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ;
 ; GFX9-LABEL: v_mul_i1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 ; GFX9-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_i1:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s10, s2
-; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 ; GFX10-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX10-NEXT:    s_mov_b32 s0, s4
-; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s5, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX10-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i1:
@@ -2211,67 +2211,67 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX9-NEXT:  ; %bb.1: ; %else
-; GFX9-NEXT:    s_mul_i32 s8, s0, s1
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_mul_i32 s6, s0, s1
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch .LBB15_3
 ; GFX9-NEXT:  .LBB15_2:
-; GFX9-NEXT:    s_mov_b64 s[0:1], -1
-; GFX9-NEXT:    ; implicit-def: $sgpr8
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
+; GFX9-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-NEXT:  .LBB15_3: ; %Flow
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB15_5
 ; GFX9-NEXT:  ; %bb.4: ; %if
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s2
+; GFX9-NEXT:    s_mov_b32 s5, s3
+; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_branch .LBB15_6
 ; GFX9-NEXT:  .LBB15_5:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:  .LBB15_6: ; %endif
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: mul32_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GFX10-NEXT:    s_mov_b32 s8, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    s_mul_i32 s0, s0, s1
+; GFX10-NEXT:    s_mul_i32 s5, s0, s1
 ; GFX10-NEXT:    s_branch .LBB15_3
 ; GFX10-NEXT:  .LBB15_2:
-; GFX10-NEXT:    s_mov_b32 s8, -1
-; GFX10-NEXT:    ; implicit-def: $sgpr0
+; GFX10-NEXT:    s_mov_b32 s4, -1
+; GFX10-NEXT:    ; implicit-def: $sgpr5
 ; GFX10-NEXT:  .LBB15_3: ; %Flow
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB15_5
 ; GFX10-NEXT:  ; %bb.4: ; %if
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s0, s6
-; GFX10-NEXT:    s_mov_b32 s1, s7
-; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    s_mov_b32 s4, s2
+; GFX10-NEXT:    s_mov_b32 s5, s3
+; GFX10-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_branch .LBB15_6
 ; GFX10-NEXT:  .LBB15_5:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-NEXT:  .LBB15_6: ; %endif
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: mul32_in_branch:
@@ -2472,72 +2472,72 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: mul64_in_branch:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB16_3
 ; GFX9-NEXT:  ; %bb.1: ; %else
-; GFX9-NEXT:    s_mul_i32 s2, s8, s11
-; GFX9-NEXT:    s_mul_hi_u32 s3, s8, s10
-; GFX9-NEXT:    s_add_i32 s2, s3, s2
-; GFX9-NEXT:    s_mul_i32 s3, s9, s10
-; GFX9-NEXT:    s_add_i32 s3, s2, s3
-; GFX9-NEXT:    s_mul_i32 s2, s8, s10
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_mul_i32 s7, s4, s7
+; GFX9-NEXT:    s_mul_hi_u32 s10, s4, s6
+; GFX9-NEXT:    s_add_i32 s7, s10, s7
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_add_i32 s5, s7, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s6
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB16_4
 ; GFX9-NEXT:  .LBB16_2: ; %if
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s1, s7
-; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s4, s2
+; GFX9-NEXT:    s_mov_b32 s5, s3
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_branch .LBB16_5
 ; GFX9-NEXT:  .LBB16_3:
-; GFX9-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-NEXT:    s_branch .LBB16_2
 ; GFX9-NEXT:  .LBB16_4:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:  .LBB16_5: ; %endif
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: mul64_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB16_3
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    s_mul_i32 s0, s8, s11
-; GFX10-NEXT:    s_mul_hi_u32 s1, s8, s10
-; GFX10-NEXT:    s_mul_i32 s2, s9, s10
-; GFX10-NEXT:    s_add_i32 s0, s1, s0
-; GFX10-NEXT:    s_add_i32 s1, s0, s2
-; GFX10-NEXT:    s_mul_i32 s0, s8, s10
+; GFX10-NEXT:    s_mul_i32 s7, s4, s7
+; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT:    s_mul_i32 s5, s5, s6
+; GFX10-NEXT:    s_add_i32 s7, s8, s7
+; GFX10-NEXT:    s_mul_i32 s4, s4, s6
+; GFX10-NEXT:    s_add_i32 s5, s7, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB16_4
 ; GFX10-NEXT:  .LBB16_2: ; %if
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s0, s6
-; GFX10-NEXT:    s_mov_b32 s1, s7
-; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s4, s2
+; GFX10-NEXT:    s_mov_b32 s5, s3
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX10-NEXT:    s_branch .LBB16_5
 ; GFX10-NEXT:  .LBB16_3:
-; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX10-NEXT:    s_branch .LBB16_2
 ; GFX10-NEXT:  .LBB16_4:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:  .LBB16_5: ; %endif
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: mul64_in_branch:
@@ -3102,12 +3102,12 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ;
 ; GFX9-LABEL: v_mul_i128:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x2c
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v13, s[4:5]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v13, s[6:7]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
 ; GFX9-NEXT:    v_mul_lo_u32 v14, v5, v2
@@ -3128,18 +3128,18 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX9-NEXT:    v_add3_u32 v3, v16, v3, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx4 v13, v[8:11], s[6:7]
+; GFX9-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_i128:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x2c
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v13, s[4:5]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v13, s[6:7]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s0, v0, v4, 0
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v2
@@ -3160,7 +3160,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v12
 ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    global_store_dwordx4 v13, v[8:11], s[6:7]
+; GFX10-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i128:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index be77a10380c49..842dc36e00154 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GFX9-LABEL: test_smul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smul24_i32:
@@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: test_smulhi24_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
-; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
-; GFX9-NEXT:    s_mul_hi_i32 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
+; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
+; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_smulhi24_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index a1099554559af..0c0bb830ba847 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b
 ;
 ; GFX9-LABEL: test_umul24_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
-; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
-; GFX9-NEXT:    s_mul_i32 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %0 = shl i32 %a, 8
@@ -158,18 +158,18 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: test_umul24_i16_vgpr_sext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
-; GFX9-NEXT:    global_load_ushort v3, v1, s[6:7]
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v3, v1, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v2, v3
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -279,17 +279,17 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX9-LABEL: test_umul24_i16_vgpr:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
-; GFX9-NEXT:    global_load_ushort v3, v1, s[6:7]
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3]
+; GFX9-NEXT:    global_load_ushort v3, v1, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v2, v3
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a,
 ;
 ; GFX9-LABEL: test_umulhi24_i32_i64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    s_and_b32 s4, s6, 0xffffff
-; GFX9-NEXT:    s_and_b32 s5, s7, 0xffffff
-; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffffff
+; GFX9-NEXT:    s_and_b32 s1, s3, 0xffffff
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.24 = and i32 %a, 16777215
@@ -661,14 +661,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32
 ;
 ; GFX9-LABEL: test_umulhi16_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s0, s6, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s7, 0xffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[4:5]
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX9-NEXT:    s_mul_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
   %a.16 = and i32 %a, 65535
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
index 7e5def00ee7cb..d73b1bd29c981 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
@@ -62,15 +62,15 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p)
 define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p)  {
 ; GCN-LABEL: if_masked_0x8000000000000000:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s1, s5, 0x80000000
+; GCN-NEXT:    s_and_b32 s1, s1, 0x80000000
 ; GCN-NEXT:    s_cmp_eq_u64 s[0:1], 0
 ; GCN-NEXT:    s_cselect_b32 s0, 22, 33
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-NEXT:    global_store_dword v0, v1, s[6:7]
+; GCN-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GCN-NEXT:    s_endpgm
   %and = and i64 %arg, 9223372036854775808
   %cmp = icmp eq i64 %and, 0
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index d6fc52bdb9322..c72a7ba3eee83 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -4,17 +4,17 @@
 define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -35,17 +35,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -67,17 +67,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -99,17 +99,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -132,17 +132,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -163,17 +163,17 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_neg_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -194,16 +194,16 @@ bb:
 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v0, v0
 ; GCN-NEXT:    ds_read_u16 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
-; GCN-NEXT:    global_store_dword v2, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -222,11 +222,11 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v1
-; GCN-NEXT:    global_store_dword v4, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v4, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -261,10 +261,10 @@ bb:
 define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
 ; GCN-NEXT:    ds_read_u16 v3, v1
@@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
+; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -299,15 +299,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -330,15 +330,15 @@ define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -362,14 +362,14 @@ define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:4
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v1, v0 op_sel:[0,1]
-; GCN-NEXT:    global_store_dword v2, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1
@@ -389,15 +389,15 @@ define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, pt
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -420,15 +420,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace(
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -453,15 +453,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out,
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -483,15 +483,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %o
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -514,15 +514,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -544,15 +544,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -574,15 +574,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -604,15 +604,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrs
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -634,16 +634,16 @@ define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v0, v0
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GCN-NEXT:    v_pk_add_f16 v0, v0, v1
-; GCN-NEXT:    global_store_dword v2, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -661,16 +661,16 @@ define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr a
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v0, v0
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GCN-NEXT:    v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0]
-; GCN-NEXT:    global_store_dword v2, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -689,18 +689,18 @@ define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    ds_read_b32 v2, v0
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_and_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GCN-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_add_u16 v0, v2, v0
-; GCN-NEXT:    global_store_dword v1, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -726,14 +726,14 @@ define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addr
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_load_ushort v3, v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -764,7 +764,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    ds_read_b32 v2, v0 offset:4
 ; GCN-NEXT:    ds_read_b32 v0, v0 offset:8
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GCN-NEXT:    ; kill: killed $vgpr0_vgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_load_ushort v3, v[0:1], off glc
@@ -776,7 +776,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT:    global_store_dword v3, v0, s[2:3]
+; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
index 40542bc6f05a7..f53ca53518a17 100644
--- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
@@ -8,16 +8,16 @@
 define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 {
 ; GCN-LABEL: dbg_clause:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dword v1, v0, s[6:7]
+; GCN-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GCN-NEXT:    ;DEBUG_VALUE: foo:a <- $vgpr1
-; GCN-NEXT:    global_load_dword v2, v0, s[6:7] offset:32
+; GCN-NEXT:    global_load_dword v2, v0, s[2:3] offset:32
 ; GCN-NEXT:    ;DEBUG_VALUE: foo:b <- $vgpr2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, v1, v2
-; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
+; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
index 0c8dbd1db8ec6..dabb9d43bf3d6 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
@@ -21,7 +21,7 @@ define protected amdgpu_kernel void @load_v3i32_align8(ptr addrspace(1) %arg) #0
 ; GCN-LABEL: load_v3i32_align8:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
   %vec = load <3 x i32>, ptr addrspace(1) %arg, align 8
   store <3 x i32> %vec, ptr addrspace(1) undef, align 8
   ret void
@@ -52,7 +52,7 @@ define protected amdgpu_kernel void @load_v3f32_align8(ptr addrspace(1) %arg) #0
 ; GCN-LABEL: load_v3f32_align8:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
   %vec = load <3 x float>, ptr addrspace(1) %arg, align 8
   store <3 x float> %vec, ptr addrspace(1) undef, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index f916c9375bc6d..9a8d5acfbe3e9 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -5,28 +5,28 @@
 define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) {
 ; SDAG-LABEL: buffers_dont_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_dont_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GISEL-NEXT:    s_endpgm
   %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
   %s0 = fmul float %l0, %l0
@@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a
 define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) {
 ; SDAG-LABEL: buffers_from_flat_dont_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; SDAG-NEXT:    s_mov_b32 s3, 0
-; SDAG-NEXT:    s_mov_b32 s2, 16
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; SDAG-NEXT:    s_mov_b32 s7, 0
+; SDAG-NEXT:    s_mov_b32 s6, 16
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    s_and_b32 s1, s5, 0xffff
-; SDAG-NEXT:    s_mov_b32 s0, s4
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; SDAG-NEXT:    s_and_b32 s1, s7, 0xffff
-; SDAG-NEXT:    s_mov_b32 s0, s6
+; SDAG-NEXT:    s_and_b32 s5, s1, 0xffff
+; SDAG-NEXT:    s_mov_b32 s4, s0
+; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    s_and_b32 s5, s3, 0xffff
+; SDAG-NEXT:    s_mov_b32 s4, s2
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_from_flat_dont_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GISEL-NEXT:    s_mov_b32 s3, 0
-; GISEL-NEXT:    s_mov_b32 s2, 16
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-NEXT:    s_mov_b32 s7, 0
+; GISEL-NEXT:    s_mov_b32 s6, 16
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_and_b32 s1, s5, 0xffff
-; GISEL-NEXT:    s_mov_b32 s0, s4
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GISEL-NEXT:    s_and_b32 s1, s7, 0xffff
-; GISEL-NEXT:    s_mov_b32 s0, s6
+; GISEL-NEXT:    s_and_b32 s5, s1, 0xffff
+; GISEL-NEXT:    s_mov_b32 s4, s0
+; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    s_and_b32 s5, s3, 0xffff
+; GISEL-NEXT:    s_mov_b32 s4, s2
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GISEL-NEXT:    s_endpgm
   %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0)
   %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0)
@@ -110,46 +110,46 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
 define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) {
 ; SDAG-LABEL: buffers_might_alias:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:4
+; SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4
-; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:8
+; SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:8
-; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:12
+; SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
+; SDAG-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:12
+; SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:12
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_might_alias:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:8
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:8
-; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:12
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
+; GISEL-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:12
+; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:12
 ; GISEL-NEXT:    s_endpgm
   %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
   %s0 = fmul float %l0, %l0
@@ -173,28 +173,28 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac
 define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) {
 ; SDAG-LABEL: independent_offsets:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 1.0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
+; SDAG-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    buffer_store_dword v2, v0, s[4:7], 0 offen
+; SDAG-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; SDAG-NEXT:    s_waitcnt vmcnt(1)
-; SDAG-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
+; SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: independent_offsets:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
+; GISEL-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
 ; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    buffer_store_dword v2, v0, s[4:7], 0 offen
+; GISEL-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
+; GISEL-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
 ; GISEL-NEXT:    s_endpgm
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %idx = shl i32 %lane, 2
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index fe093d4ac8515..6b036f675929e 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -520,21 +520,28 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, 1, v7
@@ -545,6 +552,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, s14
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -1078,10 +1086,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -1910,21 +1918,28 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v7, 1, v7
@@ -1935,6 +1950,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, s14
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -2468,10 +2484,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
deleted file mode 100644
index ebc209bd4d451..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
+++ /dev/null
@@ -1,344 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
-
-define amdgpu_ps void @test_export_zeroes_f32() #0 {
-; GCN-LABEL: test_export_zeroes_f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    exp mrt0 off, off, off, off
-; GCN-NEXT:    exp mrt0 off, off, off, off done
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false)
-  ret void
-}
-
-define amdgpu_ps void @test_export_en_src0_f32() #0 {
-; GCN-LABEL: test_export_en_src0_f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
-; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
-; GCN-NEXT:    exp mrt0 v3, off, off, off done
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
-  ret void
-}
-
-define amdgpu_gs void @test_export_gs() #0 {
-; GCN-LABEL: test_export_gs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
-; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
-; GCN-NEXT:    exp mrt0 off, v2, off, off done
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
-  ret void
-}
-
-define amdgpu_hs void @test_export_hs() #0 {
-; GCN-LABEL: test_export_hs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
-; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
-; GCN-NEXT:    exp mrt0 off, v2, off, off done
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
-  ret void
-}
-
-define amdgpu_gfx void @test_export_gfx(float %v) #0 {
-; GCN-LABEL: test_export_gfx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, 4.0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0.5
-; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
-; GCN-NEXT:    exp mrt0 off, v3, off, off done
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float %v, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
-  ret void
-}
-
-define amdgpu_cs void @test_export_cs() #0 {
-; GCN-LABEL: test_export_cs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
-; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
-; GCN-NEXT:    exp mrt0 off, v2, off, off done
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
-  ret void
-}
-
-define amdgpu_kernel void @test_export_kernel() #0 {
-; GCN-LABEL: test_export_kernel:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0.5
-; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
-; GCN-NEXT:    v_mov_b32_e32 v3, 1.0
-; GCN-NEXT:    exp mrt0 off, v2, off, off done
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
-  ret void
-}
-
-define amdgpu_gfx void @test_no_export_gfx(float %v) #0 {
-; GCN-LABEL: test_no_export_gfx:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  ret void
-}
-
-define amdgpu_ps void @test_no_export_ps(float %v) #0 {
-; GCN-LABEL: test_no_export_ps:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_endpgm
-  ret void
-}
-
-define amdgpu_ps void @test_if_export_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
-; GCN-LABEL: test_if_export_f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_mov_b32 s0, exec_lo
-; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v0
-; GCN-NEXT:    s_cbranch_execz .LBB9_2
-; GCN-NEXT:  ; %bb.1: ; %exp
-; GCN-NEXT:    exp mrt0 v1, v2, v3, v4
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:  .LBB9_2: ; %end
-; GCN-NEXT:    s_endpgm
-  %cc = icmp eq i32 %flag, 0
-  br i1 %cc, label %end, label %exp
-
-exp:
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false)
-  br label %end
-
-end:
-  ret void
-}
-
-define amdgpu_ps void @test_if_export_vm_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
-; GCN-LABEL: test_if_export_vm_f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_mov_b32 s0, exec_lo
-; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v0
-; GCN-NEXT:    s_cbranch_execz .LBB10_2
-; GCN-NEXT:  ; %bb.1: ; %exp
-; GCN-NEXT:    exp mrt0 v1, v2, v3, v4
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:  .LBB10_2: ; %end
-; GCN-NEXT:    s_endpgm
-  %cc = icmp eq i32 %flag, 0
-  br i1 %cc, label %end, label %exp
-
-exp:
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 true)
-  br label %end
-
-end:
-  ret void
-}
-
-define amdgpu_ps void @test_if_export_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
-; GCN-LABEL: test_if_export_done_f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_mov_b32 s0, exec_lo
-; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v0
-; GCN-NEXT:    s_cbranch_execz .LBB11_2
-; GCN-NEXT:  ; %bb.1: ; %exp
-; GCN-NEXT:    exp mrt0 v1, v2, v3, v4 done
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:  .LBB11_2: ; %end
-; GCN-NEXT:    s_endpgm
-  %cc = icmp eq i32 %flag, 0
-  br i1 %cc, label %end, label %exp
-
-exp:
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 false)
-  br label %end
-
-end:
-  ret void
-}
-
-define amdgpu_ps void @test_if_export_vm_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
-; GCN-LABEL: test_if_export_vm_done_f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_mov_b32 s0, exec_lo
-; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v0
-; GCN-NEXT:    s_cbranch_execz .LBB12_2
-; GCN-NEXT:  ; %bb.1: ; %exp
-; GCN-NEXT:    exp mrt0 v1, v2, v3, v4 done
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:  .LBB12_2: ; %end
-; GCN-NEXT:    s_endpgm
-  %cc = icmp eq i32 %flag, 0
-  br i1 %cc, label %end, label %exp
-
-exp:
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
-  br label %end
-
-end:
-  ret void
-}
-
-define amdgpu_ps void @test_export_pos_before_param_across_load(i32 %idx) #0 {
-; GCN-LABEL: test_export_pos_before_param_across_load:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
-; GCN-NEXT:    v_mov_b32_e32 v3, 0.5
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    exp pos0 v1, v1, v1, v0 done
-; GCN-NEXT:    exp invalid_target_32 v2, v2, v2, v2
-; GCN-NEXT:    exp invalid_target_33 v2, v2, v2, v3
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 false, i1 false)
-  call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float 0.5, i1 false, i1 false)
-  %load = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0)
-  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float %load, i1 true, i1 false)
-  ret void
-}
-
-define amdgpu_ps void @test_export_across_store_load(i32 %idx, float %v) #0 {
-; GCN-LABEL: test_export_across_store_load:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    v_mov_b32_e32 v2, 16
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    scratch_store_b32 v0, v1, off
-; GCN-NEXT:    scratch_load_b32 v0, off, off
-; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
-; GCN-NEXT:    exp pos0 v2, v2, v2, v1 done
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_waitcnt_expcnt null, 0x0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    exp invalid_target_32 v0, v2, v1, v2
-; GCN-NEXT:    exp invalid_target_33 v0, v2, v1, v2
-; GCN-NEXT:    s_setprio 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_endpgm
-  %data0 = alloca <4 x float>, align 8, addrspace(5)
-  %data1 = alloca <4 x float>, align 8, addrspace(5)
-  %cmp = icmp eq i32 %idx, 1
-  %data = select i1 %cmp, ptr addrspace(5) %data0, ptr addrspace(5) %data1
-  store float %v, ptr addrspace(5) %data, align 8
-  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 true, i1 false)
-  %load0 = load float, ptr addrspace(5) %data0, align 8
-  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
-  call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false)
-  ret void
-}
-
-define amdgpu_ps void @test_export_in_callee(float %v) #0 {
-; GCN-LABEL: test_export_in_callee:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:    s_add_u32 s0, s0, test_export_gfx@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s1, s1, test_export_gfx@gotpcrel32@hi+12
-; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GCN-NEXT:    s_endpgm
-  %x = fadd float %v, 1.0
-  call void @test_export_gfx(float %x)
-  ret void
-}
-
-define amdgpu_ps void @test_export_in_callee_prio(float %v) #0 {
-; GCN-LABEL: test_export_in_callee_prio:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GCN-NEXT:    s_setprio 2
-; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:    s_add_u32 s0, s0, test_export_gfx@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s1, s1, test_export_gfx@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GCN-NEXT:    s_endpgm
-  %x = fadd float %v, 1.0
-  call void @llvm.amdgcn.s.setprio(i16 0)
-  call void @test_export_gfx(float %x)
-  ret void
-}
-
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
-declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #2
-declare void @llvm.amdgcn.s.setprio(i16)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind inaccessiblememonly }
-attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.mir b/llvm/test/CodeGen/AMDGPU/required-export-priority.mir
deleted file mode 100644
index 392519171cd71..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/required-export-priority.mir
+++ /dev/null
@@ -1,294 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GFX1150 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GFX1150 %s
-
---- |
-  define amdgpu_ps void @end_of_shader() {
-    ret void
-  }
-  define amdgpu_ps void @end_of_shader_return_to_epilogue() {
-    ret void
-  }
-  define amdgpu_ps void @end_of_block() {
-    ret void
-  }
-  define amdgpu_ps void @start_of_block() {
-    ret void
-  }
-  define amdgpu_ps void @block_of_exports() {
-    ret void
-  }
-  define amdgpu_ps void @sparse_exports() {
-    ret void
-  }
-  define amdgpu_ps void @existing_setprio_1() {
-    ret void
-  }
-  define amdgpu_ps void @existing_setprio_2() {
-    ret void
-  }
-...
-
----
-name: end_of_shader
-tracksRegLiveness: true
-liveins:
-  - { reg: '$vgpr0' }
-body: |
-  bb.0:
-    liveins: $vgpr0
-    ; GFX1150-LABEL: name: end_of_shader
-    ; GFX1150: liveins: $vgpr0
-    ; GFX1150-NEXT: {{  $}}
-    ; GFX1150-NEXT: S_SETPRIO 2
-    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: S_SETPRIO 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_ENDPGM 0
-    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name: end_of_shader_return_to_epilogue
-tracksRegLiveness: true
-liveins:
-  - { reg: '$vgpr0' }
-body: |
-  bb.0:
-    liveins: $vgpr0
-    ; GFX1150-LABEL: name: end_of_shader_return_to_epilogue
-    ; GFX1150: liveins: $vgpr0
-    ; GFX1150-NEXT: {{  $}}
-    ; GFX1150-NEXT: S_SETPRIO 2
-    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: S_SETPRIO 0
-    ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_SETPRIO 2
-    ; GFX1150-NEXT: SI_RETURN_TO_EPILOG $vgpr0
-    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    SI_RETURN_TO_EPILOG $vgpr0
-...
-
----
-name: end_of_block
-tracksRegLiveness: true
-liveins:
-  - { reg: '$vgpr0' }
-body: |
-  ; GFX1150-LABEL: name: end_of_block
-  ; GFX1150: bb.0:
-  ; GFX1150-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX1150-NEXT:   liveins: $vgpr0
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT:   S_SETPRIO 2
-  ; GFX1150-NEXT:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-  ; GFX1150-NEXT:   S_SETPRIO 0
-  ; GFX1150-NEXT:   S_WAITCNT_EXPCNT $sgpr_null, 0
-  ; GFX1150-NEXT:   S_NOP 0
-  ; GFX1150-NEXT:   S_NOP 0
-  ; GFX1150-NEXT:   S_SETPRIO 2
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT: bb.1:
-  ; GFX1150-NEXT:   S_ENDPGM 0
-  bb.0:
-    liveins: $vgpr0
-    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-
-  bb.1:
-    S_ENDPGM 0
-...
-
----
-name: start_of_block
-tracksRegLiveness: true
-liveins:
-  - { reg: '$vgpr0' }
-body: |
-  ; GFX1150-LABEL: name: start_of_block
-  ; GFX1150: bb.0:
-  ; GFX1150-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX1150-NEXT:   liveins: $vgpr0
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT:   S_SETPRIO 2
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT: bb.1:
-  ; GFX1150-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX1150-NEXT:   liveins: $vgpr0
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-  ; GFX1150-NEXT:   S_SETPRIO 0
-  ; GFX1150-NEXT:   S_WAITCNT_EXPCNT $sgpr_null, 0
-  ; GFX1150-NEXT:   S_NOP 0
-  ; GFX1150-NEXT:   S_NOP 0
-  ; GFX1150-NEXT:   S_SETPRIO 2
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT: bb.2:
-  ; GFX1150-NEXT:   S_ENDPGM 0
-  bb.0:
-    liveins: $vgpr0
-
-  bb.1:
-    liveins: $vgpr0
-    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-
-  bb.2:
-    S_ENDPGM 0
-...
-
----
-name: block_of_exports
-tracksRegLiveness: true
-liveins:
-  - { reg: '$vgpr0' }
-body: |
-  bb.0:
-    liveins: $vgpr0
-    ; GFX1150-LABEL: name: block_of_exports
-    ; GFX1150: liveins: $vgpr0
-    ; GFX1150-NEXT: {{  $}}
-    ; GFX1150-NEXT: S_SETPRIO 2
-    ; GFX1150-NEXT: EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: S_SETPRIO 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_ENDPGM 0
-    EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name: sparse_exports
-tracksRegLiveness: true
-liveins:
-  - { reg: '$vgpr0' }
-body: |
-  bb.0:
-    liveins: $vgpr0
-    ; GFX1150-LABEL: name: sparse_exports
-    ; GFX1150: liveins: $vgpr0
-    ; GFX1150-NEXT: {{  $}}
-    ; GFX1150-NEXT: S_SETPRIO 2
-    ; GFX1150-NEXT: EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: S_SETPRIO 0
-    ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_SETPRIO 2
-    ; GFX1150-NEXT: $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec
-    ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: S_SETPRIO 0
-    ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_SETPRIO 2
-    ; GFX1150-NEXT: $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec
-    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: S_SETPRIO 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_ENDPGM 0
-    EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec
-    EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec
-    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name: existing_setprio_1
-tracksRegLiveness: true
-liveins:
-  - { reg: '$vgpr0' }
-body: |
-  ; GFX1150-LABEL: name: existing_setprio_1
-  ; GFX1150: bb.0:
-  ; GFX1150-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX1150-NEXT:   liveins: $vgpr0
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT:   S_SETPRIO 2
-  ; GFX1150-NEXT:   $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT: bb.1:
-  ; GFX1150-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX1150-NEXT:   liveins: $vgpr0
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT:   S_SETPRIO 3
-  ; GFX1150-NEXT:   $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec
-  ; GFX1150-NEXT:   S_SETPRIO 2
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT: bb.2:
-  ; GFX1150-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX1150-NEXT:   liveins: $vgpr0
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT:   S_SETPRIO 3
-  ; GFX1150-NEXT:   $vgpr0 = V_OR_B32_e32 3, $vgpr0, implicit $exec
-  ; GFX1150-NEXT:   S_SETPRIO 2
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT: bb.3:
-  ; GFX1150-NEXT:   liveins: $vgpr0
-  ; GFX1150-NEXT: {{  $}}
-  ; GFX1150-NEXT:   EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-  ; GFX1150-NEXT:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-  ; GFX1150-NEXT:   S_SETPRIO 0
-  ; GFX1150-NEXT:   S_NOP 0
-  ; GFX1150-NEXT:   S_NOP 0
-  ; GFX1150-NEXT:   S_ENDPGM 0
-  bb.0:
-    liveins: $vgpr0
-    $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec
-
-  bb.1:
-    liveins: $vgpr0
-    S_SETPRIO 3
-    $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec
-    S_SETPRIO 0
-
-  bb.2:
-    liveins: $vgpr0
-    S_SETPRIO 1
-    $vgpr0 = V_OR_B32_e32 3, $vgpr0, implicit $exec
-    S_SETPRIO 0
-
-  bb.3:
-    liveins: $vgpr0
-    EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name: existing_setprio_2
-tracksRegLiveness: true
-liveins:
-  - { reg: '$vgpr0' }
-body: |
-  bb.0:
-    liveins: $vgpr0
-    ; GFX1150-LABEL: name: existing_setprio_2
-    ; GFX1150: liveins: $vgpr0
-    ; GFX1150-NEXT: {{  $}}
-    ; GFX1150-NEXT: S_SETPRIO 3
-    ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    ; GFX1150-NEXT: S_SETPRIO 0
-    ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_NOP 0
-    ; GFX1150-NEXT: S_SETPRIO 2
-    ; GFX1150-NEXT: S_SETPRIO 3
-    ; GFX1150-NEXT: S_ENDPGM 0
-    S_SETPRIO 3
-    EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
-    S_SETPRIO 3
-    S_ENDPGM 0
-...
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 002de8bb4eb51..a640ac985ade4 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -113,7 +113,16 @@ define amdgpu_kernel void @test_kernel() !dbg !3 {
   ret void
 }
 
-; STDERR-NOT: test_func
+; STDERR: remark: foo.cl:42:0: Function Name: test_func
+; STDERR-NEXT: remark: foo.cl:42:0:     SGPRs: 0
+; STDERR-NEXT: remark: foo.cl:42:0:     VGPRs: 0
+; STDERR-NEXT: remark: foo.cl:42:0:     AGPRs: 0
+; STDERR-NEXT: remark: foo.cl:42:0:     ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:42:0:     Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:42:0:     Occupancy [waves/SIMD]: 0
+; STDERR-NEXT: remark: foo.cl:42:0:     SGPRs Spill: 0
+; STDERR-NEXT: remark: foo.cl:42:0:     VGPRs Spill: 0
+; STDERR-NOT: LDS Size
 define void @test_func() !dbg !6 {
   call void asm sideeffect "; clobber v17", "~{v17}"()
   call void asm sideeffect "; clobber s11", "~{s11}"()
@@ -135,7 +144,15 @@ define amdgpu_kernel void @empty_kernel() !dbg !7 {
   ret void
 }
 
-; STDERR-NOT: empty_func
+; STDERR: remark: foo.cl:52:0: Function Name: empty_func
+; STDERR-NEXT: remark: foo.cl:52:0:     SGPRs: 0
+; STDERR-NEXT: remark: foo.cl:52:0:     VGPRs: 0
+; STDERR-NEXT: remark: foo.cl:52:0:     AGPRs: 0
+; STDERR-NEXT: remark: foo.cl:52:0:     ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:52:0:     Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:52:0:     Occupancy [waves/SIMD]: 0
+; STDERR-NEXT: remark: foo.cl:52:0:     SGPRs Spill: 0
+; STDERR-NEXT: remark: foo.cl:52:0:     VGPRs Spill: 0
 define void @empty_func() !dbg !8 {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index f692584ef92be..fdce4431fbbf2 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -47,12 +47,12 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: rotl_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sub_i32 s0, 32, s7
-; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s0
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_sub_i32 s3, 32, s3
+; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotl_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index a368aa1055b28..0e1dd69d930ae 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -43,11 +43,11 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ;
 ; GFX10-LABEL: rotr_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, s6, s6, s7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: rotr_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 57c936d4689eb..684279a3776fc 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -288,35 +288,35 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_saddo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v3, v1, v2 clamp
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_saddo_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX10-NEXT:    global_load_dword v2, v0, s[10:11]
+; GFX10-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_i32 v3, v1, v2 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX10-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    global_store_byte v0, v2, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_saddo_i32:
@@ -401,38 +401,38 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: s_saddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s8, s10
+; GFX9-NEXT:    s_add_u32 s8, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_addc_u32 s9, s5, s7
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s9, s11
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[10:11], 0
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:    global_store_byte v2, v0, s[6:7]
+; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_saddo_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s0, s8, s10
-; GFX10-NEXT:    s_addc_u32 s1, s9, s11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_xor_b32 s0, s2, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_byte v2, v3, s[6:7]
+; GFX10-NEXT:    s_add_u32 s8, s4, s6
+; GFX10-NEXT:    s_addc_u32 s9, s5, s7
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, s[6:7], 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    s_xor_b32 s4, s6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_saddo_i64:
@@ -656,11 +656,11 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_saddo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
@@ -670,18 +670,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_saddo_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v5, s[8:9]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v5, s[10:11]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v5, s[4:5]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v5, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v1, v3
 ; GFX10-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
@@ -691,8 +691,8 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v5, v[3:4], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v5, v[0:1], s[6:7]
+; GFX10-NEXT:    global_store_dwordx2 v5, v[3:4], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v5, v[0:1], s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_saddo_v2i32:
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index c10600a78e3e7..b57a51f1382ae 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -241,23 +241,23 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i
 ;
 ; GFX9-LABEL: sdiv_i32_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: sdiv_i32_4:
@@ -339,25 +339,25 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX9-LABEL: slow_sdiv_i32_3435:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, 0x98a1930b
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s2, 0x98a1930b
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s0
-; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
 ; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: slow_sdiv_i32_3435:
@@ -732,17 +732,17 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: sdiv_v2i32_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
@@ -752,7 +752,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: sdiv_v2i32_4:
@@ -1371,17 +1371,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: sdiv_v4i32_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
@@ -1399,7 +1399,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: sdiv_v4i32_4:
@@ -1515,18 +1515,18 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX9-LABEL: v_sdiv_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    s_mov_b32 s10, s2
-; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s10, s6
+; GFX9-NEXT:    s_mov_b32 s11, s7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s6
-; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_mov_b32 s9, s3
 ; GFX9-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:1
 ; GFX9-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1543,7 +1543,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_sdiv_i8:
@@ -2253,21 +2253,21 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
 ;
 ; GFX9-LABEL: scalarize_mulhs_4xi32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT:    s_mov_b32 s4, 0x1389c755
-; GFX9-NEXT:    s_mov_b32 s0, s6
-; GFX9-NEXT:    s_mov_b32 s1, s7
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s0, 0x1389c755
+; GFX9-NEXT:    s_mov_b32 s4, s2
+; GFX9-NEXT:    s_mov_b32 s5, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_hi_i32 v0, v0, s4
-; GFX9-NEXT:    v_mul_hi_i32 v1, v1, s4
-; GFX9-NEXT:    v_mul_hi_i32 v2, v2, s4
-; GFX9-NEXT:    v_mul_hi_i32 v3, v3, s4
+; GFX9-NEXT:    v_mul_hi_i32 v0, v0, s0
+; GFX9-NEXT:    v_mul_hi_i32 v1, v1, s0
+; GFX9-NEXT:    v_mul_hi_i32 v2, v2, s0
+; GFX9-NEXT:    v_mul_hi_i32 v3, v3, s0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
@@ -2280,7 +2280,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: scalarize_mulhs_4xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 64b3317edc519..669ed915a002a 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -36,24 +36,24 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: add_shr_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: add_shr_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %in, align 4
   %shr = lshr i32 %a, 16
@@ -94,24 +94,24 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: sub_shr_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sub_shr_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %in, align 4
   %shr = lshr i32 %a, 16
@@ -1483,29 +1483,29 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
 ;
 ; GFX9-LABEL: sitofp_v2i16_to_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f16_i16_e32 v2, v1
 ; GFX9-NEXT:    v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sitofp_v2i16_to_v2f16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f16_i16_e32 v2, v1
 ; GFX10-NEXT:    v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -1644,27 +1644,27 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: immediate_mul_v2i16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0x141007b
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s2, 0x141007b
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, s0
-; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: immediate_mul_v2i16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, 0x141007b, v0
-; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1929,12 +1929,12 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ;
 ; GFX9-LABEL: pulled_out_test:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 8
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
@@ -1950,18 +1950,18 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: pulled_out_test:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1975,7 +1975,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ; GFX10-NEXT:    v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX10-NEXT:    s_endpgm
 entry:
   %idxprom = ashr exact i64 15, 32
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 9b9f03ff74aa3..8c663d963b73e 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
 ; GFX9-LABEL: s_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s7, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_shl_v2i16:
@@ -59,14 +59,14 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ;
 ; GFX10-LABEL: s_shl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s7, s6
-; GFX10-NEXT:    s_mov_b32 s0, s4
-; GFX10-NEXT:    s_mov_b32 s1, s5
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_shl_v2i16:
@@ -90,13 +90,13 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_shl_v2i16:
@@ -142,13 +142,13 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX10-LABEL: v_shl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
-; GFX10-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_shl_v2i16:
@@ -374,13 +374,13 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: shl_imm_v_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_imm_v_v2i16:
@@ -426,13 +426,13 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX10-LABEL: shl_imm_v_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: shl_imm_v_v2i16:
@@ -462,13 +462,13 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: shl_v_imm_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_v_imm_v2i16:
@@ -510,13 +510,13 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX10-LABEL: shl_v_imm_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: shl_v_imm_v2i16:
@@ -546,14 +546,14 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_shl_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_shl_v4i16:
@@ -609,14 +609,14 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX10-LABEL: v_shl_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_shl_v4i16:
@@ -649,14 +649,14 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
 define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: shl_v_imm_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: shl_v_imm_v4i16:
@@ -708,14 +708,14 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX10-LABEL: shl_v_imm_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: shl_v_imm_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 88e2bb772a2d3..ddf331816694a 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -84,24 +84,24 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_x_sub_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_x_sub_64:
@@ -223,35 +223,35 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u32_e32 v1, 64, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, 64, v2
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -359,24 +359,24 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_test_i32_64_sub_x:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_64_sub_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_64_sub_x:
@@ -474,46 +474,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-SDAG-LABEL: v_test_i32_x_sub_65:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_i32_x_sub_65:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 0x41, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_i32_x_sub_65:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_i32_x_sub_65:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_i32_x_sub_65:
@@ -626,24 +626,24 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_test_i32_65_sub_x:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 0x41, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_65_sub_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x41, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_65_sub_x:
@@ -741,46 +741,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, -16, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, -16, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16:
@@ -893,24 +893,24 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_test_i32_neg16_sub_x:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, -16, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_neg16_sub_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, -16, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_neg16_sub_x:
@@ -1008,46 +1008,46 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_u32_e32 v1, 17, v1
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 0xffffffef, v1
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 17, v1
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17:
@@ -1160,24 +1160,24 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_test_i32_neg17_sub_x:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 0xffffffef, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i32_neg17_sub_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i32_neg17_sub_x:
@@ -1330,24 +1330,24 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i16_x_sub_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i16_x_sub_64:
@@ -1451,27 +1451,27 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v1, s[6:7]
+; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v1, s[6:7]
+; GFX10-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32:
@@ -1597,35 +1597,35 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_subrev_u16_e32 v1, 64, v1
 ; GFX9-NEXT:    v_subrev_u16_e32 v2, 64, v2
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v1, v1, 64
 ; GFX10-NEXT:    v_sub_nc_u16 v2, v2, 64
-; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_store_short v0, v2, s[4:5]
+; GFX10-NEXT:    global_store_short v0, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1748,24 +1748,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_64_64:
@@ -1878,37 +1878,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, 0x400007
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x400007
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s0
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x400007
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x400007
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_7_64:
@@ -2021,37 +2021,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_mov_b32 s0, 0x7b0040
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x7b0040
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s0
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b0040
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x7b0040
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_64_123:
@@ -2161,24 +2161,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 7
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 7
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_7_0:
@@ -2286,24 +2286,24 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_0_16:
@@ -2410,37 +2410,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_brev_b32 s0, 35
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_brev_b32 s2, 35
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s0
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 35
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0xc4000000
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0:
@@ -2547,37 +2547,37 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_brev_b32 s0, 34
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_brev_b32 s2, 34
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s0
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    v_pk_sub_i16 v1, v1, s2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v2, 34
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 0x44000000
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0:
@@ -2691,24 +2691,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32:
@@ -2815,24 +2815,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_0_neg32:
@@ -2941,24 +2941,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg32_0:
@@ -3072,24 +3072,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16:
@@ -3196,24 +3196,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_0_neg16:
@@ -3322,24 +3322,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 16
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 16
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg16_0:
@@ -3452,48 +3452,48 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0xc400
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0xc400
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc400c400
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
@@ -3621,48 +3621,48 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    s_movk_i32 s0, 0x4400
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0x4400
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x44004400
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
@@ -3790,24 +3790,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo:
@@ -3920,24 +3920,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo:
@@ -4043,24 +4043,24 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
@@ -4163,47 +4163,47 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
 ;
 ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-GISEL-NEXT:    v_not_b32_e32 v2, 31
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xffffffe0, v1
-; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index c9c00a84e0f4b..52db7fea08e05 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -92,8 +92,8 @@ entry:
 ; GCN-LABEL: {{^}}smrd6:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
-; GFX9_10: s_add_u32 s0, s6, -4
-; GFX9_10: s_addc_u32 s1, s7, -1
+; GFX9_10: s_add_u32 s2, s2, -4
+; GFX9_10: s_addc_u32 s3, s3, -1
 ; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 7b0241984a349..abf013e39eefa 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -7,20 +7,20 @@
 define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i16_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_movk_i32 s0, 0x4925
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[6:7]
+; GCN-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GCN-NEXT:    s_movk_i32 s2, 0x4925
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 31, v2
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 17, v2
 ; GCN-NEXT:    v_add_u16_e32 v2, v2, v3
 ; GCN-NEXT:    v_mul_lo_u16_e32 v2, 7, v2
 ; GCN-NEXT:    v_sub_u16_e32 v1, v1, v2
-; GCN-NEXT:    global_store_short v0, v1, s[4:5]
+; GCN-NEXT:    global_store_short v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i16_7:
@@ -113,38 +113,38 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s0, v1
-; GCN-NEXT:    s_abs_i32 s0, s0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_sub_i32 s3, 0, s0
-; GCN-NEXT:    s_ashr_i32 s2, s1, 31
+; GCN-NEXT:    v_readfirstlane_b32 s2, v1
+; GCN-NEXT:    s_abs_i32 s2, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GCN-NEXT:    v_readfirstlane_b32 s3, v0
+; GCN-NEXT:    s_sub_i32 s5, 0, s2
+; GCN-NEXT:    s_ashr_i32 s4, s3, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    s_abs_i32 s1, s1
+; GCN-NEXT:    s_abs_i32 s3, s3
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v0
-; GCN-NEXT:    s_mul_i32 s3, s3, s6
-; GCN-NEXT:    s_mul_hi_u32 s3, s6, s3
-; GCN-NEXT:    s_add_i32 s6, s6, s3
-; GCN-NEXT:    s_mul_hi_u32 s3, s1, s6
-; GCN-NEXT:    s_mul_i32 s3, s3, s0
-; GCN-NEXT:    s_sub_i32 s1, s1, s3
-; GCN-NEXT:    s_sub_i32 s3, s1, s0
-; GCN-NEXT:    s_cmp_ge_u32 s1, s0
-; GCN-NEXT:    s_cselect_b32 s1, s3, s1
-; GCN-NEXT:    s_sub_i32 s3, s1, s0
-; GCN-NEXT:    s_cmp_ge_u32 s1, s0
-; GCN-NEXT:    s_cselect_b32 s0, s3, s1
-; GCN-NEXT:    s_xor_b32 s0, s0, s2
-; GCN-NEXT:    s_sub_i32 s0, s0, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    global_store_dword v2, v0, s[4:5]
+; GCN-NEXT:    s_mul_i32 s5, s5, s6
+; GCN-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GCN-NEXT:    s_add_i32 s6, s6, s5
+; GCN-NEXT:    s_mul_hi_u32 s5, s3, s6
+; GCN-NEXT:    s_mul_i32 s5, s5, s2
+; GCN-NEXT:    s_sub_i32 s3, s3, s5
+; GCN-NEXT:    s_sub_i32 s5, s3, s2
+; GCN-NEXT:    s_cmp_ge_u32 s3, s2
+; GCN-NEXT:    s_cselect_b32 s3, s5, s3
+; GCN-NEXT:    s_sub_i32 s5, s3, s2
+; GCN-NEXT:    s_cmp_ge_u32 s3, s2
+; GCN-NEXT:    s_cselect_b32 s2, s5, s3
+; GCN-NEXT:    s_xor_b32 s2, s2, s4
+; GCN-NEXT:    s_sub_i32 s2, s2, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i32:
@@ -277,17 +277,17 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i32_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dword v1, v0, s[6:7]
+; GCN-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
 ; GCN-NEXT:    v_add_u32_e32 v2, v1, v2
 ; GCN-NEXT:    v_and_b32_e32 v2, -4, v2
 ; GCN-NEXT:    v_sub_u32_e32 v1, v1, v2
-; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
+; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i32_4:
@@ -363,20 +363,20 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i32_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b32 s0, 0x92492493
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dword v1, v0, s[6:7]
+; GCN-NEXT:    global_load_dword v1, v0, s[2:3]
+; GCN-NEXT:    s_mov_b32 s2, 0x92492493
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_hi_i32 v2, v1, s0
+; GCN-NEXT:    v_mul_hi_i32 v2, v1, s2
 ; GCN-NEXT:    v_add_u32_e32 v2, v2, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 31, v2
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
 ; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, 7
 ; GCN-NEXT:    v_sub_u32_e32 v1, v1, v2
-; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
+; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i32_7:
@@ -459,64 +459,64 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s0, v2
-; GCN-NEXT:    s_abs_i32 s0, s0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_sub_i32 s6, 0, s0
-; GCN-NEXT:    s_ashr_i32 s3, s1, 31
+; GCN-NEXT:    v_readfirstlane_b32 s2, v2
+; GCN-NEXT:    s_abs_i32 s2, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GCN-NEXT:    v_readfirstlane_b32 s3, v0
+; GCN-NEXT:    s_sub_i32 s6, 0, s2
+; GCN-NEXT:    s_ashr_i32 s5, s3, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    s_abs_i32 s1, s1
-; GCN-NEXT:    v_readfirstlane_b32 s2, v3
+; GCN-NEXT:    s_abs_i32 s3, s3
+; GCN-NEXT:    v_readfirstlane_b32 s4, v3
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v0
 ; GCN-NEXT:    s_mul_i32 s6, s6, s7
 ; GCN-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GCN-NEXT:    s_add_i32 s7, s7, s6
-; GCN-NEXT:    s_mul_hi_u32 s6, s1, s7
-; GCN-NEXT:    s_mul_i32 s6, s6, s0
-; GCN-NEXT:    s_sub_i32 s1, s1, s6
-; GCN-NEXT:    s_sub_i32 s6, s1, s0
-; GCN-NEXT:    s_cmp_ge_u32 s1, s0
-; GCN-NEXT:    s_cselect_b32 s1, s6, s1
-; GCN-NEXT:    s_sub_i32 s6, s1, s0
-; GCN-NEXT:    s_cmp_ge_u32 s1, s0
-; GCN-NEXT:    s_cselect_b32 s0, s6, s1
-; GCN-NEXT:    s_abs_i32 s1, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GCN-NEXT:    s_xor_b32 s0, s0, s3
-; GCN-NEXT:    s_sub_i32 s7, 0, s1
-; GCN-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-NEXT:    s_mul_hi_u32 s6, s3, s7
+; GCN-NEXT:    s_mul_i32 s6, s6, s2
+; GCN-NEXT:    s_sub_i32 s3, s3, s6
+; GCN-NEXT:    s_sub_i32 s6, s3, s2
+; GCN-NEXT:    s_cmp_ge_u32 s3, s2
+; GCN-NEXT:    s_cselect_b32 s3, s6, s3
+; GCN-NEXT:    s_sub_i32 s6, s3, s2
+; GCN-NEXT:    s_cmp_ge_u32 s3, s2
+; GCN-NEXT:    s_cselect_b32 s2, s6, s3
+; GCN-NEXT:    s_abs_i32 s3, s4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GCN-NEXT:    s_xor_b32 s2, s2, s5
+; GCN-NEXT:    s_sub_i32 s7, 0, s3
+; GCN-NEXT:    s_sub_i32 s2, s2, s5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s2, v1
-; GCN-NEXT:    s_ashr_i32 s6, s2, 31
-; GCN-NEXT:    s_abs_i32 s2, s2
+; GCN-NEXT:    v_readfirstlane_b32 s4, v1
+; GCN-NEXT:    s_ashr_i32 s6, s4, 31
+; GCN-NEXT:    s_abs_i32 s4, s4
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s3, v0
-; GCN-NEXT:    s_mul_i32 s7, s7, s3
-; GCN-NEXT:    s_mul_hi_u32 s7, s3, s7
-; GCN-NEXT:    s_add_i32 s3, s3, s7
-; GCN-NEXT:    s_mul_hi_u32 s3, s2, s3
-; GCN-NEXT:    s_mul_i32 s3, s3, s1
-; GCN-NEXT:    s_sub_i32 s2, s2, s3
-; GCN-NEXT:    s_sub_i32 s3, s2, s1
-; GCN-NEXT:    s_cmp_ge_u32 s2, s1
-; GCN-NEXT:    s_cselect_b32 s2, s3, s2
-; GCN-NEXT:    s_sub_i32 s3, s2, s1
-; GCN-NEXT:    s_cmp_ge_u32 s2, s1
-; GCN-NEXT:    s_cselect_b32 s1, s3, s2
-; GCN-NEXT:    s_xor_b32 s1, s1, s6
-; GCN-NEXT:    s_sub_i32 s1, s1, s6
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GCN-NEXT:    v_readfirstlane_b32 s5, v0
+; GCN-NEXT:    s_mul_i32 s7, s7, s5
+; GCN-NEXT:    s_mul_hi_u32 s7, s5, s7
+; GCN-NEXT:    s_add_i32 s5, s5, s7
+; GCN-NEXT:    s_mul_hi_u32 s5, s4, s5
+; GCN-NEXT:    s_mul_i32 s5, s5, s3
+; GCN-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-NEXT:    s_sub_i32 s5, s4, s3
+; GCN-NEXT:    s_cmp_ge_u32 s4, s3
+; GCN-NEXT:    s_cselect_b32 s4, s5, s4
+; GCN-NEXT:    s_sub_i32 s5, s4, s3
+; GCN-NEXT:    s_cmp_ge_u32 s4, s3
+; GCN-NEXT:    s_cselect_b32 s3, s5, s4
+; GCN-NEXT:    s_xor_b32 s3, s3, s6
+; GCN-NEXT:    s_sub_i32 s3, s3, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v2i32:
@@ -723,26 +723,26 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v2i32_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    s_ashr_i32 s2, s0, 31
-; GCN-NEXT:    s_ashr_i32 s3, s1, 31
-; GCN-NEXT:    s_lshr_b32 s2, s2, 30
-; GCN-NEXT:    s_lshr_b32 s3, s3, 30
-; GCN-NEXT:    s_add_i32 s2, s0, s2
-; GCN-NEXT:    s_add_i32 s3, s1, s3
-; GCN-NEXT:    s_and_b32 s2, s2, -4
-; GCN-NEXT:    s_and_b32 s3, s3, -4
-; GCN-NEXT:    s_sub_i32 s0, s0, s2
-; GCN-NEXT:    s_sub_i32 s1, s1, s3
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    v_readfirstlane_b32 s3, v1
+; GCN-NEXT:    s_ashr_i32 s4, s2, 31
+; GCN-NEXT:    s_ashr_i32 s5, s3, 31
+; GCN-NEXT:    s_lshr_b32 s4, s4, 30
+; GCN-NEXT:    s_lshr_b32 s5, s5, 30
+; GCN-NEXT:    s_add_i32 s4, s2, s4
+; GCN-NEXT:    s_add_i32 s5, s3, s5
+; GCN-NEXT:    s_and_b32 s4, s4, -4
+; GCN-NEXT:    s_and_b32 s5, s5, -4
+; GCN-NEXT:    s_sub_i32 s2, s2, s4
+; GCN-NEXT:    s_sub_i32 s3, s3, s5
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v2i32_4:
@@ -842,118 +842,118 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[1:4], v0, s[6:7] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[5:8], v0, s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[5:8], v0, s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_readfirstlane_b32 s0, v1
-; GCN-NEXT:    s_abs_i32 s0, s0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GCN-NEXT:    s_sub_i32 s6, 0, s0
+; GCN-NEXT:    v_readfirstlane_b32 s2, v1
+; GCN-NEXT:    s_abs_i32 s2, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GCN-NEXT:    s_sub_i32 s6, 0, s2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s2, v5
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
+; GCN-NEXT:    v_readfirstlane_b32 s4, v5
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    s_abs_i32 s2, s2
-; GCN-NEXT:    v_readfirstlane_b32 s1, v2
+; GCN-NEXT:    s_abs_i32 s4, s4
+; GCN-NEXT:    v_readfirstlane_b32 s3, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v1
 ; GCN-NEXT:    s_mul_i32 s6, s6, s7
 ; GCN-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GCN-NEXT:    s_add_i32 s7, s7, s6
-; GCN-NEXT:    s_mul_hi_u32 s6, s2, s7
-; GCN-NEXT:    s_mul_i32 s6, s6, s0
-; GCN-NEXT:    s_sub_i32 s2, s2, s6
-; GCN-NEXT:    s_sub_i32 s6, s2, s0
-; GCN-NEXT:    s_cmp_ge_u32 s2, s0
-; GCN-NEXT:    s_cselect_b32 s2, s6, s2
-; GCN-NEXT:    s_sub_i32 s6, s2, s0
-; GCN-NEXT:    s_cmp_ge_u32 s2, s0
-; GCN-NEXT:    s_cselect_b32 s0, s6, s2
-; GCN-NEXT:    s_abs_i32 s1, s1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GCN-NEXT:    s_xor_b32 s0, s0, s3
-; GCN-NEXT:    s_sub_i32 s8, 0, s1
-; GCN-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-NEXT:    s_mul_hi_u32 s6, s4, s7
+; GCN-NEXT:    s_mul_i32 s6, s6, s2
+; GCN-NEXT:    s_sub_i32 s4, s4, s6
+; GCN-NEXT:    s_sub_i32 s6, s4, s2
+; GCN-NEXT:    s_cmp_ge_u32 s4, s2
+; GCN-NEXT:    s_cselect_b32 s4, s6, s4
+; GCN-NEXT:    s_sub_i32 s6, s4, s2
+; GCN-NEXT:    s_cmp_ge_u32 s4, s2
+; GCN-NEXT:    s_cselect_b32 s2, s6, s4
+; GCN-NEXT:    s_abs_i32 s3, s3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GCN-NEXT:    s_xor_b32 s2, s2, s5
+; GCN-NEXT:    s_sub_i32 s8, 0, s3
+; GCN-NEXT:    s_sub_i32 s2, s2, s5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v6
 ; GCN-NEXT:    s_ashr_i32 s7, s6, 31
 ; GCN-NEXT:    s_abs_i32 s6, s6
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_readfirstlane_b32 s2, v3
-; GCN-NEXT:    v_readfirstlane_b32 s3, v1
-; GCN-NEXT:    s_mul_i32 s8, s8, s3
-; GCN-NEXT:    s_mul_hi_u32 s8, s3, s8
-; GCN-NEXT:    s_add_i32 s3, s3, s8
-; GCN-NEXT:    s_mul_hi_u32 s3, s6, s3
-; GCN-NEXT:    s_mul_i32 s3, s3, s1
-; GCN-NEXT:    s_sub_i32 s3, s6, s3
-; GCN-NEXT:    s_sub_i32 s6, s3, s1
-; GCN-NEXT:    s_cmp_ge_u32 s3, s1
-; GCN-NEXT:    s_cselect_b32 s3, s6, s3
-; GCN-NEXT:    s_sub_i32 s6, s3, s1
-; GCN-NEXT:    s_cmp_ge_u32 s3, s1
-; GCN-NEXT:    s_cselect_b32 s1, s6, s3
-; GCN-NEXT:    s_abs_i32 s2, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GCN-NEXT:    s_xor_b32 s1, s1, s7
-; GCN-NEXT:    s_sub_i32 s9, 0, s2
-; GCN-NEXT:    s_sub_i32 s1, s1, s7
+; GCN-NEXT:    v_readfirstlane_b32 s4, v3
+; GCN-NEXT:    v_readfirstlane_b32 s5, v1
+; GCN-NEXT:    s_mul_i32 s8, s8, s5
+; GCN-NEXT:    s_mul_hi_u32 s8, s5, s8
+; GCN-NEXT:    s_add_i32 s5, s5, s8
+; GCN-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GCN-NEXT:    s_mul_i32 s5, s5, s3
+; GCN-NEXT:    s_sub_i32 s5, s6, s5
+; GCN-NEXT:    s_sub_i32 s6, s5, s3
+; GCN-NEXT:    s_cmp_ge_u32 s5, s3
+; GCN-NEXT:    s_cselect_b32 s5, s6, s5
+; GCN-NEXT:    s_sub_i32 s6, s5, s3
+; GCN-NEXT:    s_cmp_ge_u32 s5, s3
+; GCN-NEXT:    s_cselect_b32 s3, s6, s5
+; GCN-NEXT:    s_abs_i32 s4, s4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s4
+; GCN-NEXT:    s_xor_b32 s3, s3, s7
+; GCN-NEXT:    s_sub_i32 s9, 0, s4
+; GCN-NEXT:    s_sub_i32 s3, s3, s7
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v7
 ; GCN-NEXT:    s_ashr_i32 s8, s6, 31
 ; GCN-NEXT:    s_abs_i32 s6, s6
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_readfirstlane_b32 s3, v4
+; GCN-NEXT:    v_readfirstlane_b32 s5, v4
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v1
 ; GCN-NEXT:    s_mul_i32 s9, s9, s7
 ; GCN-NEXT:    s_mul_hi_u32 s9, s7, s9
 ; GCN-NEXT:    s_add_i32 s7, s7, s9
 ; GCN-NEXT:    s_mul_hi_u32 s7, s6, s7
-; GCN-NEXT:    s_mul_i32 s7, s7, s2
+; GCN-NEXT:    s_mul_i32 s7, s7, s4
 ; GCN-NEXT:    s_sub_i32 s6, s6, s7
-; GCN-NEXT:    s_sub_i32 s7, s6, s2
-; GCN-NEXT:    s_cmp_ge_u32 s6, s2
+; GCN-NEXT:    s_sub_i32 s7, s6, s4
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
 ; GCN-NEXT:    s_cselect_b32 s6, s7, s6
-; GCN-NEXT:    s_sub_i32 s7, s6, s2
-; GCN-NEXT:    s_cmp_ge_u32 s6, s2
-; GCN-NEXT:    s_cselect_b32 s2, s7, s6
-; GCN-NEXT:    s_abs_i32 s3, s3
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GCN-NEXT:    s_sub_i32 s7, s6, s4
+; GCN-NEXT:    s_cmp_ge_u32 s6, s4
+; GCN-NEXT:    s_cselect_b32 s4, s7, s6
+; GCN-NEXT:    s_abs_i32 s5, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s5
 ; GCN-NEXT:    v_readfirstlane_b32 s6, v8
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-NEXT:    s_ashr_i32 s0, s6, 31
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    s_ashr_i32 s2, s6, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    s_abs_i32 s1, s6
-; GCN-NEXT:    s_sub_i32 s6, 0, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_abs_i32 s3, s6
+; GCN-NEXT:    s_sub_i32 s6, 0, s5
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    s_xor_b32 s2, s2, s8
-; GCN-NEXT:    s_sub_i32 s2, s2, s8
+; GCN-NEXT:    s_xor_b32 s4, s4, s8
+; GCN-NEXT:    s_sub_i32 s4, s4, s8
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v3
 ; GCN-NEXT:    s_mul_i32 s6, s6, s7
 ; GCN-NEXT:    s_mul_hi_u32 s6, s7, s6
 ; GCN-NEXT:    s_add_i32 s7, s7, s6
-; GCN-NEXT:    s_mul_hi_u32 s6, s1, s7
-; GCN-NEXT:    s_mul_i32 s6, s6, s3
-; GCN-NEXT:    s_sub_i32 s1, s1, s6
-; GCN-NEXT:    s_sub_i32 s6, s1, s3
-; GCN-NEXT:    s_cmp_ge_u32 s1, s3
-; GCN-NEXT:    s_cselect_b32 s1, s6, s1
-; GCN-NEXT:    s_sub_i32 s6, s1, s3
-; GCN-NEXT:    s_cmp_ge_u32 s1, s3
-; GCN-NEXT:    s_cselect_b32 s1, s6, s1
-; GCN-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-NEXT:    s_sub_i32 s0, s1, s0
-; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
+; GCN-NEXT:    s_mul_hi_u32 s6, s3, s7
+; GCN-NEXT:    s_mul_i32 s6, s6, s5
+; GCN-NEXT:    s_sub_i32 s3, s3, s6
+; GCN-NEXT:    s_sub_i32 s6, s3, s5
+; GCN-NEXT:    s_cmp_ge_u32 s3, s5
+; GCN-NEXT:    s_cselect_b32 s3, s6, s3
+; GCN-NEXT:    s_sub_i32 s6, s3, s5
+; GCN-NEXT:    s_cmp_ge_u32 s3, s5
+; GCN-NEXT:    s_cselect_b32 s3, s6, s3
+; GCN-NEXT:    s_xor_b32 s3, s3, s2
+; GCN-NEXT:    s_sub_i32 s2, s3, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v4i32:
@@ -1317,40 +1317,40 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v4i32_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    v_readfirstlane_b32 s2, v2
-; GCN-NEXT:    v_readfirstlane_b32 s3, v3
-; GCN-NEXT:    s_ashr_i32 s6, s0, 31
-; GCN-NEXT:    s_ashr_i32 s7, s1, 31
-; GCN-NEXT:    s_ashr_i32 s8, s2, 31
-; GCN-NEXT:    s_ashr_i32 s9, s3, 31
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    v_readfirstlane_b32 s3, v1
+; GCN-NEXT:    v_readfirstlane_b32 s4, v2
+; GCN-NEXT:    v_readfirstlane_b32 s5, v3
+; GCN-NEXT:    s_ashr_i32 s6, s2, 31
+; GCN-NEXT:    s_ashr_i32 s7, s3, 31
+; GCN-NEXT:    s_ashr_i32 s8, s4, 31
+; GCN-NEXT:    s_ashr_i32 s9, s5, 31
 ; GCN-NEXT:    s_lshr_b32 s6, s6, 30
 ; GCN-NEXT:    s_lshr_b32 s7, s7, 30
 ; GCN-NEXT:    s_lshr_b32 s8, s8, 30
 ; GCN-NEXT:    s_lshr_b32 s9, s9, 30
-; GCN-NEXT:    s_add_i32 s6, s0, s6
-; GCN-NEXT:    s_add_i32 s7, s1, s7
-; GCN-NEXT:    s_add_i32 s8, s2, s8
-; GCN-NEXT:    s_add_i32 s9, s3, s9
+; GCN-NEXT:    s_add_i32 s6, s2, s6
+; GCN-NEXT:    s_add_i32 s7, s3, s7
+; GCN-NEXT:    s_add_i32 s8, s4, s8
+; GCN-NEXT:    s_add_i32 s9, s5, s9
 ; GCN-NEXT:    s_and_b32 s6, s6, -4
 ; GCN-NEXT:    s_and_b32 s7, s7, -4
 ; GCN-NEXT:    s_and_b32 s8, s8, -4
 ; GCN-NEXT:    s_and_b32 s9, s9, -4
-; GCN-NEXT:    s_sub_i32 s0, s0, s6
-; GCN-NEXT:    s_sub_i32 s1, s1, s7
-; GCN-NEXT:    s_sub_i32 s2, s2, s8
-; GCN-NEXT:    s_sub_i32 s3, s3, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-NEXT:    s_sub_i32 s2, s2, s6
+; GCN-NEXT:    s_sub_i32 s3, s3, s7
+; GCN-NEXT:    s_sub_i32 s4, s4, s8
+; GCN-NEXT:    s_sub_i32 s5, s5, s9
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v4i32_4:
@@ -2589,10 +2589,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_i64_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GCN-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
@@ -2601,7 +2601,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_and_b32_e32 v3, -4, v3
 ; GCN-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
-; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_i64_4:
@@ -4733,10 +4733,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v2i64_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
@@ -4752,7 +4752,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
 ; GCN-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
 ; GCN-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v8, vcc
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
+; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v2i64_4:
@@ -8883,11 +8883,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: srem_v4i64_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v9, 30, v9
@@ -8918,8 +8918,8 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v15, vcc
 ; GCN-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v12
 ; GCN-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v16, vcc
-; GCN-NEXT:    global_store_dwordx4 v8, v[4:7], s[4:5] offset:16
-; GCN-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
+; GCN-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GCN-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
 ; TAHITI-LABEL: srem_v4i64_4:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index df12e32d971a2..6044873563254 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -33,12 +33,12 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ;
 ; GFX9-LABEL: s_sub_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_i32 s0, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_sub_i32:
@@ -144,13 +144,13 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: test_sub_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_i32:
@@ -208,13 +208,13 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX9-LABEL: test_sub_imm_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 0x7b, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_imm_i32:
@@ -272,14 +272,14 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_sub_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_v2i32:
@@ -350,17 +350,17 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_sub_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v6, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v4, v0
-; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_v4i32:
@@ -432,16 +432,16 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: test_sub_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:2 glc
+; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_sub_u16_e32 v1, v1, v2
-; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_i16:
@@ -517,14 +517,14 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_sub_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
-; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_v2i16:
@@ -608,15 +608,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX9-LABEL: test_sub_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_sub_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 5a821db6ff040..fe234a82ba6f7 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -216,15 +216,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr
 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_pk_sub_i16 v0, s6, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_sub_v2i16_kernarg:
@@ -248,14 +248,14 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ;
 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_pk_sub_i16 v0, s6, s7
-; GFX10-NEXT:    s_mov_b32 s0, s4
-; GFX10-NEXT:    s_mov_b32 s1, s5
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_sub_v2i16_kernarg:
@@ -279,16 +279,16 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0x1c8007b
+; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_constant:
@@ -312,16 +312,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_constant:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_constant:
@@ -353,16 +353,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr
 define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0xfc21fcb3
+; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_neg_constant:
@@ -386,16 +386,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out,
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
@@ -426,15 +426,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out,
 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_neg1:
@@ -458,16 +458,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, -1
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
@@ -498,15 +498,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p
 define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
@@ -529,16 +529,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 32
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
@@ -570,15 +570,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v0, s[6:7] glc
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 1.0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
@@ -601,16 +601,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou
 ;
 ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 1.0
-; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index d9e0e0298e072..e668c1d2b7f3d 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
   ; CHECK-NEXT:   undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
-  ; CHECK-NEXT:   early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
   ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
   ; CHECK-NEXT:   $sgpr0 = COPY killed [[COPY3]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
   ; CHECK-NEXT:   $sgpr1 = COPY killed [[COPY4]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
   ; CHECK-NEXT:   $sgpr2 = COPY killed [[COPY5]]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index c08571a733cc5..03a1b3598024b 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_uaddo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ;
 ; GFX9-LABEL: v_uaddo_i32_novcc:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: s_uaddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s8, s10
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_addc_u32 s1, s9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_add_u32 s6, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_addc_u32 s7, s5, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_uaddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[8:9]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_uaddo_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[8:9]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[10:11]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v2, v1, v2
 ; GFX9-NEXT:    v_cmp_lt_u16_e32 vcc, v2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_uaddo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[8:9]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
   %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 36a0cbd3f0970..dacc986205983 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_usubo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ;
 ; GFX9-LABEL: v_usubo_i32_novcc:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
-; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
+; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: s_usubo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s0, s8, s10
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    s_subb_u32 s1, s9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_sub_u32 s6, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_subb_u32 s7, s5, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_usubo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[8:9]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    global_store_byte v4, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX9-LABEL: v_usubo_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[8:9]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[10:11]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5]
+; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_e32 v2, v1, v2
 ; GFX9-NEXT:    v_cmp_gt_u16_e32 vcc, v2, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ;
 ; GFX9-LABEL: v_usubo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[8:9]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
   %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
index 1bb76cf547e25..2210b6c0d3c3a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
 define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
 ; GFX9-LABEL: test_add_co_sdwa:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v2, s[6:7]
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v3, s[4:5]
+; GFX9-NEXT:    global_load_dword v4, v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v3, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index d1bf5ecb56984..a8f3635416cff 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -137,13 +137,13 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ;
 ; GFX10-LABEL: v_cnd_nan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s7, s[0:1]
-; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_cnd_nan:
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 5a7cce39f6103..9f6d27802e184 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -7,12 +7,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32_v2f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
+; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -24,12 +24,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 ;
 ; GISEL-LABEL: v_pack_b32_v2f16:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
+; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -56,12 +56,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32_v2f16_sub:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
+; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
 ; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -73,12 +73,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 ;
 ; GISEL-LABEL: v_pack_b32_v2f16_sub:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
+; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -105,36 +105,36 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 define amdgpu_kernel void @fptrunc(
 ; GCN-LABEL: fptrunc:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_mov_b32 s3, 0x31016000
-; GCN-NEXT:    s_mov_b32 s10, s2
-; GCN-NEXT:    s_mov_b32 s11, s3
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0x31016000
+; GCN-NEXT:    s_mov_b32 s10, s6
+; GCN-NEXT:    s_mov_b32 s11, s7
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s6
-; GCN-NEXT:    s_mov_b32 s9, s7
-; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s8, s2
+; GCN-NEXT:    s_mov_b32 s9, s3
+; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fptrunc:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GISEL-NEXT:    s_mov_b32 s6, -1
-; GISEL-NEXT:    s_mov_b32 s7, 0x31016000
+; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s0
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -147,12 +147,12 @@ define amdgpu_kernel void @fptrunc(
 define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32.fabs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
+; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -164,12 +164,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 ;
 ; GISEL-LABEL: v_pack_b32.fabs:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
+; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -198,12 +198,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32.fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
+; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
@@ -215,12 +215,12 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
 ;
 ; GISEL-LABEL: v_pack_b32.fneg:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_ushort v1, v0, s[4:5] glc dlc
+; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
+; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index ee99fcc586334..8579cbdf47137 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -104,15 +104,15 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; SDAG-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX9-NEXT:    v_med3_i16 v2, s6, 0, v1
-; SDAG-GFX9-NEXT:    v_med3_i16 v1, s7, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v2, s2, 0, v1
+; SDAG-GFX9-NEXT:    v_med3_i16 v1, s3, 0, v1
 ; SDAG-GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SDAG-GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
-; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; SDAG-GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; SDAG-GFX9-NEXT:    s_endpgm
 ;
 ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr:
@@ -156,22 +156,22 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ;
 ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, 0
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s1, 0xff
+; GISEL-GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s4, 0
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s5, 0xff
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s6
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s7
-; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s0
-; GISEL-GFX9-NEXT:    s_max_i32 s0, s3, s0
 ; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GISEL-GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s1
-; GISEL-GFX9-NEXT:    s_min_i32 s0, s0, s1
-; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, s0
-; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
+; GISEL-GFX9-NEXT:    s_max_i32 s2, s2, s4
+; GISEL-GFX9-NEXT:    s_max_i32 s3, s3, s4
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s2, s2
+; GISEL-GFX9-NEXT:    s_sext_i32_i16 s3, s3
+; GISEL-GFX9-NEXT:    s_min_i32 s2, s2, s5
+; GISEL-GFX9-NEXT:    s_min_i32 s3, s3, s5
+; GISEL-GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
+; GISEL-GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-GFX9-NEXT:    s_endpgm
 ;
 ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
index 973b7f5c04987..02a6024f858e9 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
 define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
 ; GFX9-LABEL: test_sub_co_sdwa:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v2, s[6:7]
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v3, s[4:5]
+; GFX9-NEXT:    global_load_dword v4, v2, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v3, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 2a280bcda42f5..02da6deb96f1f 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -473,9 +473,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT: bb.1.if.then:
   ; SI-NEXT:   successors: %bb.7(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   early-clobber %33:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
-  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %33.sub0, killed %54, 0, implicit $exec
-  ; SI-NEXT:   [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %33.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
+  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %54, 0, implicit $exec
+  ; SI-NEXT:   [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
   ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
   ; SI-NEXT:   [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -570,9 +570,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
   ; SI-NEXT: bb.1.if.then:
   ; SI-NEXT:   successors: %bb.2(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   early-clobber %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
+  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
   ; SI-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
-  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %10, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
   ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4)
   ; SI-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
   ; SI-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 33c06e5d1e3a5..7301b341cbc71 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -405,33 +405,33 @@ return:
 define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_phi_chain:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GFX906-NEXT:  .LBB8_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[8:9]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
 ; GFX906-NEXT:  .LBB8_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[10:11]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -460,28 +460,28 @@ bb.3:
 define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_phi_zeroinit:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[4:5]
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v5, s[0:1]
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[6:7]
-; GFX906-NEXT:    s_mov_b32 s4, 0
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[2:3]
+; GFX906-NEXT:    s_mov_b32 s2, 0
+; GFX906-NEXT:    s_mov_b32 s3, s2
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT:    s_mov_b32 s5, s4
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    v_mov_b32_e32 v3, s4
-; GFX906-NEXT:    v_mov_b32_e32 v4, s5
+; GFX906-NEXT:    v_mov_b32_e32 v4, s3
+; GFX906-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[4:5], vcc, exec
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GFX906-NEXT:  .LBB9_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
@@ -489,12 +489,12 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
 ; GFX906-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    v_mov_b32_e32 v2, v4
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[8:9]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
 ; GFX906-NEXT:  .LBB9_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[10:11]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[1:2], s[6:7]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -522,9 +522,8 @@ bb.3:
 define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_phi_const:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    ; implicit-def: $vgpr3
 ; GFX906-NEXT:    ; implicit-def: $vgpr13
@@ -534,7 +533,8 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    ; implicit-def: $vgpr12
 ; GFX906-NEXT:    ; implicit-def: $vgpr16
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[0:1]
+; GFX906-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -542,13 +542,13 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
-; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v4, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v4, s[2:3]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX906-NEXT:    s_and_b64 s[2:3], vcc, exec
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX906-NEXT:    v_mov_b32_e32 v10, 2
 ; GFX906-NEXT:    v_mov_b32_e32 v9, 3
@@ -557,7 +557,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_mov_b32_e32 v7, 6
 ; GFX906-NEXT:    v_mov_b32_e32 v6, 7
 ; GFX906-NEXT:    v_mov_b32_e32 v5, 8
-; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v16, 24, v4
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
@@ -566,7 +566,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
 ; GFX906-NEXT:  .LBB10_2: ; %Flow
-; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
@@ -581,7 +581,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[8:9]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
 ; GFX906-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX906-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX906-NEXT:    v_mov_b32_e32 v11, v9
@@ -603,7 +603,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX906-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT:    global_store_dwordx2 v2, v[0:1], s[10:11]
+; GFX906-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -631,31 +631,31 @@ bb.3:
 define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
 ; GFX906-LABEL: v8i8_multi_block:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX906-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[3:4], v6, s[4:5]
+; GFX906-NEXT:    global_load_dwordx2 v[3:4], v6, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX906-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[6:7]
+; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[2:3]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
-; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[8:9]
+; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[4:5]
 ; GFX906-NEXT:  .LBB11_3: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:  .LBB11_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v5, v[1:2], s[10:11]
+; GFX906-NEXT:    global_store_dwordx2 v5, v[1:2], s[6:7]
 ; GFX906-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index c3a81771a2790..dae46361b9bcc 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -200,30 +200,30 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0
 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
 ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 0, v1
 ; GFX1032-NEXT:    v_cmp_nle_f32_e64 s0, 1.0, v1
 ; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, s0
-; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_nge_f32_e32 vcc, 0, v1
 ; GFX1064-NEXT:    v_cmp_nle_f32_e64 s[0:1], 1.0, v1
 ; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, s[0:1]
-; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX1064-NEXT:    s_endpgm
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
@@ -239,30 +239,30 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
 define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
 ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0, v1
 ; GFX1032-NEXT:    v_cmp_gt_i32_e64 s0, 1, v1
 ; GFX1032-NEXT:    s_xor_b32 s0, vcc_lo, s0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s0
-; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
 ; GFX1064-NEXT:    v_cmp_gt_i32_e64 s[0:1], 1, v1
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s[0:1]
-; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX1064-NEXT:    s_endpgm
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
@@ -278,30 +278,30 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
 define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) {
 ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 3, v1
 ; GFX1032-NEXT:    v_cmp_gt_u32_e64 s0, 2, v1
 ; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s0
-; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_lt_u32_e32 vcc, 3, v1
 ; GFX1064-NEXT:    v_cmp_gt_u32_e64 s[0:1], 2, v1
 ; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s[0:1]
-; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX1064-NEXT:    s_endpgm
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
@@ -631,26 +631,26 @@ bb8:
 define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 ; GFX1032-LABEL: test_addc_vop2b:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s6
-; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s2
+; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_addc_vop2b:
 ; GFX1064:       ; %bb.0: ; %bb
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_add_co_u32 v0, vcc, v0, s6
-; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s7, v1, vcc
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT:    v_add_co_u32 v0, vcc, v0, s2
+; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -664,26 +664,26 @@ bb:
 define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 ; GFX1032-LABEL: test_subbrev_vop2b:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s6
-; GFX1032-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s2
+; GFX1032-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_subbrev_vop2b:
 ; GFX1064:       ; %bb.0: ; %bb
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, v0, s6
-; GFX1064-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc, s7, v1, vcc
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, v0, s2
+; GFX1064-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -697,26 +697,26 @@ bb:
 define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 ; GFX1032-LABEL: test_subb_vop2b:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s6, v0
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_subb_vop2b:
 ; GFX1064:       ; %bb.0: ; %bb
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s6, v0
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s7, v1, vcc
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -730,18 +730,18 @@ bb:
 define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-LABEL: test_udiv64:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_or_b64 s[2:3], s[6:7], s[4:5]
-; GFX1032-NEXT:    s_mov_b32 s2, 0
-; GFX1032-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1032-NEXT:    s_or_b64 s[8:9], s[6:7], s[4:5]
+; GFX1032-NEXT:    s_mov_b32 s8, 0
+; GFX1032-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX1032-NEXT:    s_cbranch_scc0 .LBB15_4
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GFX1032-NEXT:    s_sub_u32 s3, 0, s4
+; GFX1032-NEXT:    s_sub_u32 s9, 0, s4
 ; GFX1032-NEXT:    s_subb_u32 s10, 0, s5
 ; GFX1032-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX1032-NEXT:    v_rcp_f32_e32 v0, v0
@@ -753,11 +753,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX1032-NEXT:    s_mul_i32 s11, s3, s0
-; GFX1032-NEXT:    s_mul_hi_u32 s13, s3, s1
+; GFX1032-NEXT:    s_mul_i32 s11, s9, s0
+; GFX1032-NEXT:    s_mul_hi_u32 s13, s9, s1
 ; GFX1032-NEXT:    s_mul_i32 s12, s10, s1
 ; GFX1032-NEXT:    s_add_i32 s11, s13, s11
-; GFX1032-NEXT:    s_mul_i32 s14, s3, s1
+; GFX1032-NEXT:    s_mul_i32 s14, s9, s1
 ; GFX1032-NEXT:    s_add_i32 s11, s11, s12
 ; GFX1032-NEXT:    s_mul_hi_u32 s13, s1, s14
 ; GFX1032-NEXT:    s_mul_hi_u32 s15, s0, s14
@@ -777,46 +777,46 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032-NEXT:    s_addc_u32 s0, s0, s11
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX1032-NEXT:    s_mul_i32 s11, s3, s0
-; GFX1032-NEXT:    s_mul_hi_u32 s12, s3, s1
+; GFX1032-NEXT:    s_mul_i32 s11, s9, s0
+; GFX1032-NEXT:    s_mul_hi_u32 s12, s9, s1
 ; GFX1032-NEXT:    s_mul_i32 s10, s10, s1
 ; GFX1032-NEXT:    s_add_i32 s11, s12, s11
-; GFX1032-NEXT:    s_mul_i32 s3, s3, s1
+; GFX1032-NEXT:    s_mul_i32 s9, s9, s1
 ; GFX1032-NEXT:    s_add_i32 s11, s11, s10
-; GFX1032-NEXT:    s_mul_hi_u32 s12, s0, s3
-; GFX1032-NEXT:    s_mul_i32 s13, s0, s3
-; GFX1032-NEXT:    s_mul_hi_u32 s3, s1, s3
+; GFX1032-NEXT:    s_mul_hi_u32 s12, s0, s9
+; GFX1032-NEXT:    s_mul_i32 s13, s0, s9
+; GFX1032-NEXT:    s_mul_hi_u32 s9, s1, s9
 ; GFX1032-NEXT:    s_mul_hi_u32 s14, s1, s11
 ; GFX1032-NEXT:    s_mul_i32 s1, s1, s11
 ; GFX1032-NEXT:    s_mul_hi_u32 s10, s0, s11
-; GFX1032-NEXT:    s_add_u32 s1, s3, s1
-; GFX1032-NEXT:    s_addc_u32 s3, 0, s14
+; GFX1032-NEXT:    s_add_u32 s1, s9, s1
+; GFX1032-NEXT:    s_addc_u32 s9, 0, s14
 ; GFX1032-NEXT:    s_add_u32 s1, s1, s13
 ; GFX1032-NEXT:    s_mul_i32 s11, s0, s11
-; GFX1032-NEXT:    s_addc_u32 s1, s3, s12
-; GFX1032-NEXT:    s_addc_u32 s3, s10, 0
+; GFX1032-NEXT:    s_addc_u32 s1, s9, s12
+; GFX1032-NEXT:    s_addc_u32 s9, s10, 0
 ; GFX1032-NEXT:    s_add_u32 s1, s1, s11
-; GFX1032-NEXT:    s_addc_u32 s3, 0, s3
+; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
 ; GFX1032-NEXT:    v_add_co_u32 v0, s1, v0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX1032-NEXT:    s_addc_u32 s0, s0, s3
+; GFX1032-NEXT:    s_addc_u32 s0, s0, s9
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX1032-NEXT:    s_mul_i32 s10, s6, s0
-; GFX1032-NEXT:    s_mul_hi_u32 s3, s6, s0
+; GFX1032-NEXT:    s_mul_hi_u32 s9, s6, s0
 ; GFX1032-NEXT:    s_mul_hi_u32 s11, s7, s0
 ; GFX1032-NEXT:    s_mul_i32 s0, s7, s0
 ; GFX1032-NEXT:    s_mul_hi_u32 s12, s6, s1
 ; GFX1032-NEXT:    s_mul_hi_u32 s13, s7, s1
 ; GFX1032-NEXT:    s_mul_i32 s1, s7, s1
 ; GFX1032-NEXT:    s_add_u32 s10, s12, s10
-; GFX1032-NEXT:    s_addc_u32 s3, 0, s3
+; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
 ; GFX1032-NEXT:    s_add_u32 s1, s10, s1
-; GFX1032-NEXT:    s_addc_u32 s1, s3, s13
-; GFX1032-NEXT:    s_addc_u32 s3, s11, 0
+; GFX1032-NEXT:    s_addc_u32 s1, s9, s13
+; GFX1032-NEXT:    s_addc_u32 s9, s11, 0
 ; GFX1032-NEXT:    s_add_u32 s1, s1, s0
-; GFX1032-NEXT:    s_addc_u32 s3, 0, s3
+; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
 ; GFX1032-NEXT:    s_mul_hi_u32 s0, s4, s1
-; GFX1032-NEXT:    s_mul_i32 s11, s4, s3
+; GFX1032-NEXT:    s_mul_i32 s11, s4, s9
 ; GFX1032-NEXT:    s_mul_i32 s12, s4, s1
 ; GFX1032-NEXT:    s_add_i32 s0, s0, s11
 ; GFX1032-NEXT:    v_sub_co_u32 v0, s11, s6, s12
@@ -836,9 +836,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1032-NEXT:    s_add_u32 s10, s1, 1
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
-; GFX1032-NEXT:    s_addc_u32 s12, s3, 0
+; GFX1032-NEXT:    s_addc_u32 s12, s9, 0
 ; GFX1032-NEXT:    s_add_u32 s13, s1, 2
-; GFX1032-NEXT:    s_addc_u32 s14, s3, 0
+; GFX1032-NEXT:    s_addc_u32 s14, s9, 0
 ; GFX1032-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX1032-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v0
 ; GFX1032-NEXT:    s_subb_u32 s0, s7, s0
@@ -854,9 +854,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v2, s10, v2, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
 ; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
-; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
 ; GFX1032-NEXT:    s_cbranch_vccnz .LBB15_3
 ; GFX1032-NEXT:  .LBB15_2:
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
@@ -870,21 +870,21 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_add_i32 s0, s0, s1
 ; GFX1032-NEXT:    s_mul_hi_u32 s0, s6, s0
 ; GFX1032-NEXT:    s_mul_i32 s1, s0, s4
-; GFX1032-NEXT:    s_add_i32 s2, s0, 1
+; GFX1032-NEXT:    s_add_i32 s5, s0, 1
 ; GFX1032-NEXT:    s_sub_i32 s1, s6, s1
-; GFX1032-NEXT:    s_sub_i32 s3, s1, s4
+; GFX1032-NEXT:    s_sub_i32 s6, s1, s4
 ; GFX1032-NEXT:    s_cmp_ge_u32 s1, s4
-; GFX1032-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX1032-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX1032-NEXT:    s_add_i32 s2, s0, 1
+; GFX1032-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX1032-NEXT:    s_cselect_b32 s1, s6, s1
+; GFX1032-NEXT:    s_add_i32 s5, s0, 1
 ; GFX1032-NEXT:    s_cmp_ge_u32 s1, s4
 ; GFX1032-NEXT:    s_mov_b32 s1, 0
-; GFX1032-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX1032-NEXT:    s_cselect_b32 s0, s5, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032-NEXT:  .LBB15_3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9] offset:16
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
 ; GFX1032-NEXT:    s_endpgm
 ; GFX1032-NEXT:  .LBB15_4:
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -892,9 +892,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX1064-LABEL: test_udiv64:
 ; GFX1064:       ; %bb.0: ; %bb
-; GFX1064-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_or_b64 s[0:1], s[6:7], s[4:5]
 ; GFX1064-NEXT:    s_mov_b32 s0, 0
@@ -903,7 +903,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GFX1064-NEXT:    s_sub_u32 s3, 0, s4
+; GFX1064-NEXT:    s_sub_u32 s9, 0, s4
 ; GFX1064-NEXT:    s_subb_u32 s10, 0, s5
 ; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX1064-NEXT:    v_rcp_f32_e32 v0, v0
@@ -913,92 +913,92 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX1064-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1064-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s8, v1
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    s_mul_i32 s1, s3, s2
-; GFX1064-NEXT:    s_mul_hi_u32 s12, s3, s0
+; GFX1064-NEXT:    s_mul_i32 s1, s9, s8
+; GFX1064-NEXT:    s_mul_hi_u32 s12, s9, s0
 ; GFX1064-NEXT:    s_mul_i32 s11, s10, s0
 ; GFX1064-NEXT:    s_add_i32 s1, s12, s1
-; GFX1064-NEXT:    s_mul_i32 s13, s3, s0
+; GFX1064-NEXT:    s_mul_i32 s13, s9, s0
 ; GFX1064-NEXT:    s_add_i32 s1, s1, s11
 ; GFX1064-NEXT:    s_mul_hi_u32 s12, s0, s13
-; GFX1064-NEXT:    s_mul_hi_u32 s14, s2, s13
-; GFX1064-NEXT:    s_mul_i32 s11, s2, s13
+; GFX1064-NEXT:    s_mul_hi_u32 s14, s8, s13
+; GFX1064-NEXT:    s_mul_i32 s11, s8, s13
 ; GFX1064-NEXT:    s_mul_hi_u32 s13, s0, s1
 ; GFX1064-NEXT:    s_mul_i32 s0, s0, s1
-; GFX1064-NEXT:    s_mul_hi_u32 s15, s2, s1
+; GFX1064-NEXT:    s_mul_hi_u32 s15, s8, s1
 ; GFX1064-NEXT:    s_add_u32 s0, s12, s0
 ; GFX1064-NEXT:    s_addc_u32 s12, 0, s13
 ; GFX1064-NEXT:    s_add_u32 s0, s0, s11
-; GFX1064-NEXT:    s_mul_i32 s1, s2, s1
+; GFX1064-NEXT:    s_mul_i32 s1, s8, s1
 ; GFX1064-NEXT:    s_addc_u32 s0, s12, s14
 ; GFX1064-NEXT:    s_addc_u32 s11, s15, 0
 ; GFX1064-NEXT:    s_add_u32 s0, s0, s1
 ; GFX1064-NEXT:    s_addc_u32 s11, 0, s11
 ; GFX1064-NEXT:    v_add_co_u32 v0, s[0:1], v0, s0
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    s_addc_u32 s2, s2, s11
+; GFX1064-NEXT:    s_addc_u32 s8, s8, s11
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    s_mul_i32 s1, s3, s2
-; GFX1064-NEXT:    s_mul_hi_u32 s11, s3, s0
+; GFX1064-NEXT:    s_mul_i32 s1, s9, s8
+; GFX1064-NEXT:    s_mul_hi_u32 s11, s9, s0
 ; GFX1064-NEXT:    s_mul_i32 s10, s10, s0
 ; GFX1064-NEXT:    s_add_i32 s1, s11, s1
-; GFX1064-NEXT:    s_mul_i32 s3, s3, s0
+; GFX1064-NEXT:    s_mul_i32 s9, s9, s0
 ; GFX1064-NEXT:    s_add_i32 s1, s1, s10
-; GFX1064-NEXT:    s_mul_hi_u32 s11, s2, s3
-; GFX1064-NEXT:    s_mul_i32 s12, s2, s3
-; GFX1064-NEXT:    s_mul_hi_u32 s3, s0, s3
+; GFX1064-NEXT:    s_mul_hi_u32 s11, s8, s9
+; GFX1064-NEXT:    s_mul_i32 s12, s8, s9
+; GFX1064-NEXT:    s_mul_hi_u32 s9, s0, s9
 ; GFX1064-NEXT:    s_mul_hi_u32 s13, s0, s1
 ; GFX1064-NEXT:    s_mul_i32 s0, s0, s1
-; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s1
-; GFX1064-NEXT:    s_add_u32 s0, s3, s0
-; GFX1064-NEXT:    s_addc_u32 s3, 0, s13
+; GFX1064-NEXT:    s_mul_hi_u32 s10, s8, s1
+; GFX1064-NEXT:    s_add_u32 s0, s9, s0
+; GFX1064-NEXT:    s_addc_u32 s9, 0, s13
 ; GFX1064-NEXT:    s_add_u32 s0, s0, s12
-; GFX1064-NEXT:    s_mul_i32 s1, s2, s1
-; GFX1064-NEXT:    s_addc_u32 s0, s3, s11
-; GFX1064-NEXT:    s_addc_u32 s3, s10, 0
+; GFX1064-NEXT:    s_mul_i32 s1, s8, s1
+; GFX1064-NEXT:    s_addc_u32 s0, s9, s11
+; GFX1064-NEXT:    s_addc_u32 s9, s10, 0
 ; GFX1064-NEXT:    s_add_u32 s0, s0, s1
-; GFX1064-NEXT:    s_addc_u32 s3, 0, s3
+; GFX1064-NEXT:    s_addc_u32 s9, 0, s9
 ; GFX1064-NEXT:    v_add_co_u32 v0, s[0:1], v0, s0
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    s_addc_u32 s0, s2, s3
+; GFX1064-NEXT:    s_addc_u32 s0, s8, s9
 ; GFX1064-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX1064-NEXT:    s_mul_i32 s3, s6, s0
-; GFX1064-NEXT:    s_mul_hi_u32 s2, s6, s0
+; GFX1064-NEXT:    s_mul_i32 s9, s6, s0
+; GFX1064-NEXT:    s_mul_hi_u32 s8, s6, s0
 ; GFX1064-NEXT:    s_mul_hi_u32 s10, s7, s0
 ; GFX1064-NEXT:    s_mul_i32 s0, s7, s0
 ; GFX1064-NEXT:    s_mul_hi_u32 s11, s6, s1
 ; GFX1064-NEXT:    s_mul_hi_u32 s12, s7, s1
 ; GFX1064-NEXT:    s_mul_i32 s1, s7, s1
-; GFX1064-NEXT:    s_add_u32 s3, s11, s3
-; GFX1064-NEXT:    s_addc_u32 s2, 0, s2
-; GFX1064-NEXT:    s_add_u32 s1, s3, s1
-; GFX1064-NEXT:    s_addc_u32 s1, s2, s12
-; GFX1064-NEXT:    s_addc_u32 s2, s10, 0
+; GFX1064-NEXT:    s_add_u32 s9, s11, s9
+; GFX1064-NEXT:    s_addc_u32 s8, 0, s8
+; GFX1064-NEXT:    s_add_u32 s1, s9, s1
+; GFX1064-NEXT:    s_addc_u32 s1, s8, s12
+; GFX1064-NEXT:    s_addc_u32 s8, s10, 0
 ; GFX1064-NEXT:    s_add_u32 s10, s1, s0
-; GFX1064-NEXT:    s_addc_u32 s11, 0, s2
+; GFX1064-NEXT:    s_addc_u32 s11, 0, s8
 ; GFX1064-NEXT:    s_mul_hi_u32 s0, s4, s10
 ; GFX1064-NEXT:    s_mul_i32 s1, s4, s11
-; GFX1064-NEXT:    s_mul_i32 s3, s4, s10
+; GFX1064-NEXT:    s_mul_i32 s9, s4, s10
 ; GFX1064-NEXT:    s_add_i32 s12, s0, s1
-; GFX1064-NEXT:    v_sub_co_u32 v0, s[0:1], s6, s3
-; GFX1064-NEXT:    s_mul_i32 s2, s5, s10
-; GFX1064-NEXT:    s_add_i32 s12, s12, s2
-; GFX1064-NEXT:    v_sub_co_u32 v1, s[2:3], v0, s4
+; GFX1064-NEXT:    v_sub_co_u32 v0, s[0:1], s6, s9
+; GFX1064-NEXT:    s_mul_i32 s8, s5, s10
+; GFX1064-NEXT:    s_add_i32 s12, s12, s8
+; GFX1064-NEXT:    v_sub_co_u32 v1, s[8:9], v0, s4
 ; GFX1064-NEXT:    s_sub_i32 s13, s7, s12
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_subb_u32 s13, s13, s5
-; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX1064-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GFX1064-NEXT:    s_subb_u32 s2, s13, 0
-; GFX1064-NEXT:    s_cmp_ge_u32 s2, s5
+; GFX1064-NEXT:    s_subb_u32 s8, s13, 0
+; GFX1064-NEXT:    s_cmp_ge_u32 s8, s5
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX1064-NEXT:    s_cselect_b32 s3, -1, 0
-; GFX1064-NEXT:    s_cmp_eq_u32 s2, s5
+; GFX1064-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX1064-NEXT:    s_cmp_eq_u32 s8, s5
 ; GFX1064-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX1064-NEXT:    s_add_u32 s2, s10, 1
-; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
-; GFX1064-NEXT:    s_addc_u32 s3, s11, 0
+; GFX1064-NEXT:    s_add_u32 s8, s10, 1
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc
+; GFX1064-NEXT:    s_addc_u32 s9, s11, 0
 ; GFX1064-NEXT:    s_add_u32 s13, s10, 2
 ; GFX1064-NEXT:    s_addc_u32 s14, s11, 0
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
@@ -1013,8 +1013,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s14
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s[0:1]
-; GFX1064-NEXT:    v_cndmask_b32_e32 v2, s2, v2, vcc
-; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc
 ; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s11, v1, vcc
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v0, s10, v2, vcc
@@ -1031,21 +1031,21 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    s_add_i32 s0, s0, s1
 ; GFX1064-NEXT:    s_mul_hi_u32 s0, s6, s0
 ; GFX1064-NEXT:    s_mul_i32 s1, s0, s4
-; GFX1064-NEXT:    s_add_i32 s2, s0, 1
+; GFX1064-NEXT:    s_add_i32 s5, s0, 1
 ; GFX1064-NEXT:    s_sub_i32 s1, s6, s1
-; GFX1064-NEXT:    s_sub_i32 s3, s1, s4
+; GFX1064-NEXT:    s_sub_i32 s6, s1, s4
 ; GFX1064-NEXT:    s_cmp_ge_u32 s1, s4
-; GFX1064-NEXT:    s_cselect_b32 s0, s2, s0
-; GFX1064-NEXT:    s_cselect_b32 s1, s3, s1
-; GFX1064-NEXT:    s_add_i32 s2, s0, 1
+; GFX1064-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX1064-NEXT:    s_cselect_b32 s1, s6, s1
+; GFX1064-NEXT:    s_add_i32 s5, s0, 1
 ; GFX1064-NEXT:    s_cmp_ge_u32 s1, s4
 ; GFX1064-NEXT:    s_mov_b32 s1, 0
-; GFX1064-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX1064-NEXT:    s_cselect_b32 s0, s5, s0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB15_3:
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9] offset:16
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
 ; GFX1064-NEXT:    s_endpgm
 ; GFX1064-NEXT:  .LBB15_4:
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -1063,30 +1063,30 @@ bb:
 define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX1032-LABEL: test_div_scale_f32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX1032-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_div_scale_f32 v1, s0, v2, v2, v1
-; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1032-NEXT:    v_div_scale_f32 v1, s2, v2, v2, v1
+; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_div_scale_f32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    global_load_dword v2, v0, s[6:7] offset:4 glc dlc
+; GFX1064-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_div_scale_f32 v1, s[0:1], v2, v2, v1
-; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1064-NEXT:    v_div_scale_f32 v1, s[2:3], v2, v2, v1
+; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1064-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1263,10 +1263,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 {
 ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_clause 0x1
 ; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x34
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x34
 ; GFX1032-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX1032-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1275,7 +1274,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
 ; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
 ; GFX1032-NEXT:  ; %bb.1: ; %bb
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    global_load_dword v0, v0, s[8:9] glc dlc
+; GFX1032-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_b32 vcc_lo, vcc_lo, exec_lo
@@ -1344,40 +1343,40 @@ exit:
 define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
 ; GFX1032-LABEL: fdiv_f32:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_div_scale_f32 v0, s0, s7, s7, s6
+; GFX1032-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
 ; GFX1032-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1032-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1032-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s6, s7, s6
+; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
 ; GFX1032-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1032-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1032-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1032-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1032-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX1032-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX1032-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: fdiv_f32:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_div_scale_f32 v0, s[0:1], s7, s7, s6
+; GFX1064-NEXT:    v_div_scale_f32 v0, s[4:5], s3, s3, s2
 ; GFX1064-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX1064-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
 ; GFX1064-NEXT:    v_fmac_f32_e32 v1, v2, v1
-; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s6, s7, s6
+; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s2, s3, s2
 ; GFX1064-NEXT:    v_mul_f32_e32 v3, v2, v1
 ; GFX1064-NEXT:    v_fma_f32 v4, -v0, v3, v2
 ; GFX1064-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1064-NEXT:    v_fma_f32 v0, -v0, v3, v2
 ; GFX1064-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s7, s6
-; GFX1064-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
+; GFX1064-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %fdiv = fdiv float %a, %b
@@ -1704,30 +1703,30 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
 ; GFX1032-LABEL: test_set_inactive_64:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_set_inactive_64:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1064-NEXT:    s_not_b64 exec, exec
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
   %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
   store i64 %tmp, ptr addrspace(1) %out
@@ -2138,23 +2137,23 @@ main_body:
 define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i64:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i64:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
@@ -2195,22 +2194,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
 define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
 ; GFX1032-LABEL: test_intr_fcmp_i32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_f32_e64 s0, s6, |s7|
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_intr_fcmp_i32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[0:1], s6, |s7|
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1064-NEXT:    s_endpgm
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
@@ -2354,42 +2353,42 @@ define amdgpu_ps float @test_ps_live() #0 {
 define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_cmp_neq_f64_e64 s2, s[0:1], 1.0
-; GFX1032-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX1032-NEXT:    v_cmp_neq_f64_e64 s4, s[2:3], 1.0
+; GFX1032-NEXT:    s_and_b32 vcc_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_cbranch_vccnz .LBB47_2
 ; GFX1032-NEXT:  ; %bb.1: ; %if
-; GFX1032-NEXT:    v_add_f64 v[0:1], s[0:1], s[0:1]
+; GFX1032-NEXT:    v_add_f64 v[0:1], s[2:3], s[2:3]
 ; GFX1032-NEXT:    s_branch .LBB47_3
 ; GFX1032-NEXT:  .LBB47_2:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1032-NEXT:  .LBB47_3: ; %endif
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_cmp_neq_f64_e64 s[2:3], s[0:1], 1.0
-; GFX1064-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX1064-NEXT:    v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0
+; GFX1064-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GFX1064-NEXT:    s_cbranch_vccnz .LBB47_2
 ; GFX1064-NEXT:  ; %bb.1: ; %if
-; GFX1064-NEXT:    v_add_f64 v[0:1], s[0:1], s[0:1]
+; GFX1064-NEXT:    v_add_f64 v[0:1], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_branch .LBB47_3
 ; GFX1064-NEXT:  .LBB47_2:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1064-NEXT:  .LBB47_3: ; %endif
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %v = load double, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index af50e09f509a3..f9137b075e462 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
 
 ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
 ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
-; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
+; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]]
 ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_short [[RESULT]]
diff --git a/llvm/test/CodeGen/ARM/neon_vabd.ll b/llvm/test/CodeGen/ARM/neon_vabd.ll
index cdfc48468e044..8695c3e5f3db9 100644
--- a/llvm/test/CodeGen/ARM/neon_vabd.ll
+++ b/llvm/test/CodeGen/ARM/neon_vabd.ll
@@ -340,20 +340,20 @@ define <2 x i64> @uabd_2d(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEXT:    sbcs r2, r3, r12
 ; CHECK-NEXT:    sbcs r3, r1, #0
 ; CHECK-NEXT:    sbc r3, r1, #0
-; CHECK-NEXT:    eor r0, r0, r3
-; CHECK-NEXT:    eor r2, r2, r3
-; CHECK-NEXT:    subs r0, r0, r3
-; CHECK-NEXT:    sbc r2, r2, r3
+; CHECK-NEXT:    eor r0, r0, r3, asr #31
+; CHECK-NEXT:    eor r2, r2, r3, asr #31
+; CHECK-NEXT:    subs r0, r0, r3, asr #31
+; CHECK-NEXT:    sbc r2, r2, r3, asr #31
 ; CHECK-NEXT:    subs r3, r4, lr
 ; CHECK-NEXT:    sbcs r6, r5, r6
 ; CHECK-NEXT:    vmov.32 d1[0], r0
 ; CHECK-NEXT:    sbcs r5, r1, #0
 ; CHECK-NEXT:    sbc r1, r1, #0
-; CHECK-NEXT:    eor r3, r3, r1
-; CHECK-NEXT:    subs r0, r3, r1
+; CHECK-NEXT:    eor r3, r3, r1, asr #31
+; CHECK-NEXT:    subs r0, r3, r1, asr #31
 ; CHECK-NEXT:    vmov.32 d0[0], r0
-; CHECK-NEXT:    eor r0, r6, r1
-; CHECK-NEXT:    sbc r0, r0, r1
+; CHECK-NEXT:    eor r0, r6, r1, asr #31
+; CHECK-NEXT:    sbc r0, r0, r1, asr #31
 ; CHECK-NEXT:    vmov.32 d1[1], r2
 ; CHECK-NEXT:    vmov.32 d0[1], r0
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
diff --git a/llvm/test/CodeGen/LoongArch/andn-icmp.ll b/llvm/test/CodeGen/LoongArch/andn-icmp.ll
index 67d62eca914e9..447f3ac5c34fd 100644
--- a/llvm/test/CodeGen/LoongArch/andn-icmp.ll
+++ b/llvm/test/CodeGen/LoongArch/andn-icmp.ll
@@ -6,12 +6,14 @@ define i1 @andn_icmp_eq_i8(i8 signext %a, i8 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_eq_i8:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    andn $a0, $a1, $a0
+; LA32-NEXT:    andi $a0, $a0, 255
 ; LA32-NEXT:    sltui $a0, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_eq_i8:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    andn $a0, $a1, $a0
+; LA64-NEXT:    andi $a0, $a0, 255
 ; LA64-NEXT:    sltui $a0, $a0, 1
 ; LA64-NEXT:    ret
   %and = and i8 %a, %b
@@ -23,12 +25,14 @@ define i1 @andn_icmp_eq_i16(i16 signext %a, i16 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_eq_i16:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    andn $a0, $a1, $a0
+; LA32-NEXT:    bstrpick.w $a0, $a0, 15, 0
 ; LA32-NEXT:    sltui $a0, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_eq_i16:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    andn $a0, $a1, $a0
+; LA64-NEXT:    bstrpick.d $a0, $a0, 15, 0
 ; LA64-NEXT:    sltui $a0, $a0, 1
 ; LA64-NEXT:    ret
   %and = and i16 %a, %b
@@ -76,12 +80,14 @@ define i1 @andn_icmp_ne_i8(i8 signext %a, i8 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_ne_i8:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    andn $a0, $a1, $a0
+; LA32-NEXT:    andi $a0, $a0, 255
 ; LA32-NEXT:    sltu $a0, $zero, $a0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_ne_i8:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    andn $a0, $a1, $a0
+; LA64-NEXT:    andi $a0, $a0, 255
 ; LA64-NEXT:    sltu $a0, $zero, $a0
 ; LA64-NEXT:    ret
   %and = and i8 %a, %b
@@ -93,12 +99,14 @@ define i1 @andn_icmp_ne_i16(i16 signext %a, i16 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_ne_i16:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    andn $a0, $a1, $a0
+; LA32-NEXT:    bstrpick.w $a0, $a0, 15, 0
 ; LA32-NEXT:    sltu $a0, $zero, $a0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_ne_i16:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    andn $a0, $a1, $a0
+; LA64-NEXT:    bstrpick.d $a0, $a0, 15, 0
 ; LA64-NEXT:    sltu $a0, $zero, $a0
 ; LA64-NEXT:    ret
   %and = and i16 %a, %b
@@ -145,13 +153,15 @@ define i1 @andn_icmp_ne_i64(i64 %a, i64 %b) nounwind {
 define i1 @andn_icmp_ult_i8(i8 signext %a, i8 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_ult_i8:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    andi $a1, $a1, 255
+; LA32-NEXT:    and $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $a0, $a1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_ult_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    and $a0, $a0, $a1
+; LA64-NEXT:    andi $a1, $a1, 255
+; LA64-NEXT:    and $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %and = and i8 %a, %b
@@ -162,13 +172,15 @@ define i1 @andn_icmp_ult_i8(i8 signext %a, i8 signext %b) nounwind {
 define i1 @andn_icmp_ult_i16(i16 signext %a, i16 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_ult_i16:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    bstrpick.w $a1, $a1, 15, 0
+; LA32-NEXT:    and $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $a0, $a1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_ult_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    and $a0, $a0, $a1
+; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
+; LA64-NEXT:    and $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $a0, $a1
 ; LA64-NEXT:    ret
   %and = and i16 %a, %b
@@ -179,14 +191,16 @@ define i1 @andn_icmp_ult_i16(i16 signext %a, i16 signext %b) nounwind {
 define i1 @andn_icmp_uge_i8(i8 signext %a, i8 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_uge_i8:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    andi $a1, $a1, 255
+; LA32-NEXT:    and $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $a0, $a1
 ; LA32-NEXT:    xori $a0, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_uge_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    and $a0, $a0, $a1
+; LA64-NEXT:    andi $a1, $a1, 255
+; LA64-NEXT:    and $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $a0, $a1
 ; LA64-NEXT:    xori $a0, $a0, 1
 ; LA64-NEXT:    ret
@@ -198,14 +212,16 @@ define i1 @andn_icmp_uge_i8(i8 signext %a, i8 signext %b) nounwind {
 define i1 @andn_icmp_uge_i16(i16 signext %a, i16 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_uge_i16:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    bstrpick.w $a1, $a1, 15, 0
+; LA32-NEXT:    and $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $a0, $a1
 ; LA32-NEXT:    xori $a0, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_uge_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    and $a0, $a0, $a1
+; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
+; LA64-NEXT:    and $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $a0, $a1
 ; LA64-NEXT:    xori $a0, $a0, 1
 ; LA64-NEXT:    ret
@@ -217,13 +233,15 @@ define i1 @andn_icmp_uge_i16(i16 signext %a, i16 signext %b) nounwind {
 define i1 @andn_icmp_ugt_i8(i8 signext %a, i8 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_ugt_i8:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    andi $a1, $a1, 255
+; LA32-NEXT:    and $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $a1, $a0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_ugt_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    and $a0, $a0, $a1
+; LA64-NEXT:    andi $a1, $a1, 255
+; LA64-NEXT:    and $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $a1, $a0
 ; LA64-NEXT:    ret
   %and = and i8 %a, %b
@@ -234,13 +252,15 @@ define i1 @andn_icmp_ugt_i8(i8 signext %a, i8 signext %b) nounwind {
 define i1 @andn_icmp_ugt_i16(i16 signext %a, i16 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_ugt_i16:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    bstrpick.w $a1, $a1, 15, 0
+; LA32-NEXT:    and $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $a1, $a0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_ugt_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    and $a0, $a0, $a1
+; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
+; LA64-NEXT:    and $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $a1, $a0
 ; LA64-NEXT:    ret
   %and = and i16 %a, %b
@@ -251,14 +271,16 @@ define i1 @andn_icmp_ugt_i16(i16 signext %a, i16 signext %b) nounwind {
 define i1 @andn_icmp_ule_i8(i8 signext %a, i8 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_ule_i8:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    andi $a1, $a1, 255
+; LA32-NEXT:    and $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $a1, $a0
 ; LA32-NEXT:    xori $a0, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_ule_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    and $a0, $a0, $a1
+; LA64-NEXT:    andi $a1, $a1, 255
+; LA64-NEXT:    and $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $a1, $a0
 ; LA64-NEXT:    xori $a0, $a0, 1
 ; LA64-NEXT:    ret
@@ -270,14 +292,16 @@ define i1 @andn_icmp_ule_i8(i8 signext %a, i8 signext %b) nounwind {
 define i1 @andn_icmp_ule_i16(i16 signext %a, i16 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_ule_i16:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    and $a0, $a0, $a1
+; LA32-NEXT:    bstrpick.w $a1, $a1, 15, 0
+; LA32-NEXT:    and $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $a1, $a0
 ; LA32-NEXT:    xori $a0, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: andn_icmp_ule_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    and $a0, $a0, $a1
+; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
+; LA64-NEXT:    and $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $a1, $a0
 ; LA64-NEXT:    xori $a0, $a0, 1
 ; LA64-NEXT:    ret
@@ -582,6 +606,7 @@ define i1 @andn_icmp_eq_i8_i32(i8 signext %a, i8 signext %b) nounwind {
 ; LA32-LABEL: andn_icmp_eq_i8_i32:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    andn $a0, $a1, $a0
+; LA32-NEXT:    andi $a0, $a0, 255
 ; LA32-NEXT:    sltui $a0, $a0, 1
 ; LA32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/LoongArch/cpus.ll b/llvm/test/CodeGen/LoongArch/cpus.ll
index be3e05e394ff9..41ff1be496e3b 100644
--- a/llvm/test/CodeGen/LoongArch/cpus.ll
+++ b/llvm/test/CodeGen/LoongArch/cpus.ll
@@ -3,7 +3,6 @@
 
 ; RUN: llc < %s --mtriple=loongarch64 -mattr=+d --mcpu=loongarch64 2>&1 | FileCheck %s
 ; RUN: llc < %s --mtriple=loongarch64 -mattr=+d --mcpu=la464 2>&1 | FileCheck %s
-; RUN: llc < %s --mtriple=loongarch64 -mattr=+d --mcpu=la664 2>&1 | FileCheck %s
 ; RUN: llc < %s --mtriple=loongarch64 -mattr=+d 2>&1 | FileCheck %s
 
 ; CHECK-NOT: {{.*}} is not a recognized processor for this target
@@ -19,7 +18,3 @@ define void @tune_cpu_loongarch64() "tune-cpu"="loongarch64" {
 define void @tune_cpu_la464() "tune-cpu"="la464" {
   ret void
 }
-
-define void @tune_cpu_la664() "tune-cpu"="la664" {
-  ret void
-}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
deleted file mode 100644
index 22ab19b9fa446..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
-
-;; xvilvl.b
-define <32 x i8> @shufflevector_xvilvl_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_xvilvl_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvilvl.b $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39,
-                                                               i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
-    ret <32 x i8> %c
-}
-
-;; xvilvl.h
-define <16 x i16> @shufflevector_xvilvl_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_xvilvl_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvilvl.h $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
-    ret <16 x i16> %c
-}
-
-;; xvilvl.w
-define <8 x i32> @shufflevector_xvilvl_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_xvilvl_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvilvl.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-    ret <8 x i32> %c
-}
-
-;; xvilvh.b
-define <32 x i8> @shufflevector_xvilvh_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_xvilvh_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvilvh.b $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47,
-                                                               i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
-    ret <32 x i8> %c
-}
-
-;; xvilvh.h
-define <16 x i16> @shufflevector_xvilvh_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_xvilvh_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvilvh.h $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-    ret <16 x i16> %c
-}
-
-;; xvilvh.w
-define <8 x i32> @shufflevector_xvilvh_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_xvilvh_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvilvh.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-    ret <8 x i32> %c
-}
-
-;; xvilvh.w
-define <8 x float> @shufflevector_xvilvh_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: shufflevector_xvilvh_v8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvilvh.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-    ret <8 x float> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
deleted file mode 100644
index 2ff9af4069b9b..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
+++ /dev/null
@@ -1,124 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
-
-;; xvpackev.b
-define <32 x i8> @shufflevector_pack_ev_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackev.b $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46,
-                                                               i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
-    ret <32 x i8> %c
-}
-
-;; xvpackev.h
-define <16 x i16> @shufflevector_pack_ev_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackev.h $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-    ret <16 x i16> %c
-}
-
-;; xvpackev.w
-define <8 x i32> @shufflevector_pack_ev_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackev.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-    ret <8 x i32> %c
-}
-
-;; xvpickev.d/xvpackev.d/xvilvl.d
-define <4 x i64> @shufflevector_pack_ev_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v4i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-    ret <4 x i64> %c
-}
-
-;; xvpackev.w
-define <8 x float> @shufflevector_pack_ev_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackev.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-    ret <8 x float> %c
-}
-
-;; xvpickev.d/xvpackev.d/xvilvl.d
-define <4 x double> @shufflevector_pack_ev_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v4f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-    ret <4 x double> %c
-}
-
-;; xvpackod.b
-define <32 x i8> @shufflevector_pack_od_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackod.b $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 33, i32 3, i32 35, i32 5, i32 37, i32 7, i32 39, i32 9, i32 41, i32 11, i32 43, i32 13, i32 45, i32 15, i32 47,
-                                                              i32 17, i32 49, i32 19, i32 51, i32 21, i32 53, i32 23, i32 55, i32 25, i32 57, i32 27, i32 59, i32 29, i32 61, i32 31, i32 63>
-    ret <32 x i8> %c
-}
-
-;; xvpackod.h
-define <16 x i16> @shufflevector_pack_od_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackod.h $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-    ret <16 x i16> %c
-}
-
-;; xvpackod.w
-define <8 x i32> @shufflevector_pack_od_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackod.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-    ret <8 x i32> %c
-}
-
-;; xvpickod.d/xvpackod.d/xvilvh.d
-define <4 x i64> @shufflodector_pack_od_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: shufflodector_pack_od_v4i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackod.d $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-    ret <4 x i64> %c
-}
-
-;; xvpackod.w
-define <8 x float> @shufflodector_pack_od_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: shufflodector_pack_od_v8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackod.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-    ret <8 x float> %c
-}
-
-;; xvpickod.d/xvpackod.d/xvilvh.d
-define <4 x double> @shufflodector_pack_od_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: shufflodector_pack_od_v4f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpackod.d $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-    ret <4 x double> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
deleted file mode 100644
index 294d292d17640..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
-
-;; xvpickev.b
-define <32 x i8> @shufflevector_pick_ev_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_pick_ev_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpickev.b $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46,
-                                                               i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
-    ret <32 x i8> %c
-}
-
-;; xvpickev.h
-define <16 x i16> @shufflevector_pick_ev_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_pick_ev_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpickev.h $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-    ret <16 x i16> %c
-}
-
-;; xvpickev.w
-define <8 x i32> @shufflevector_pick_ev_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_pick_ev_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-    ret <8 x i32> %c
-}
-
-;; xvpickev.w
-define <8 x float> @shufflevector_pick_ev_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: shufflevector_pick_ev_v8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-    ret <8 x float> %c
-}
-
-;; xvpickod.b
-define <32 x i8> @shufflevector_pick_od_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_pick_od_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpickod.b $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47,
-                                                               i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
-    ret <32 x i8> %c
-}
-
-;; xvpickod.h
-define <16 x i16> @shufflevector_pick_od_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_pick_od_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpickod.h $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-    ret <16 x i16> %c
-}
-
-;; xvpickod.w
-define <8 x i32> @shufflevector_pick_od_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_pick_od_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpickod.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-    ret <8 x i32> %c
-}
-
-;; xvpickod.w
-define <8 x float> @shufflodector_pick_od_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: shufflodector_pick_od_v8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpickod.w $xr0, $xr1, $xr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-    ret <8 x float> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
deleted file mode 100644
index dce1e4b777e29..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
-
-;; xvrepl128vei.b
-define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvrepl128vei.b $xr0, $xr0, 1
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
-                                                               i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
-    ret <32 x i8> %c
-}
-
-;; xvrepl128vei.h
-define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvrepl128vei.h $xr0, $xr0, 3
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
-                                                                 i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-    ret <16 x i16> %c
-}
-
-;; xvrepl128vei.w
-define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 78
-; CHECK-NEXT:    xvrepl128vei.w $xr0, $xr0, 3
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 3, i32 3, i32 3, i32 3>
-    ret <8 x i32> %c
-}
-
-;; xvrepl128vei.d
-define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: shufflevector_v4i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvrepl128vei.d $xr0, $xr0, 1
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-    ret <4 x i64> %c
-}
-
-;; xvrepl128vei.w
-define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: shufflevector_v8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvrepl128vei.w $xr0, $xr0, 3
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
-    ret <8 x float> %c
-}
-
-;; xvrepl128vei.d
-define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: shufflevector_v4f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvrepl128vei.d $xr0, $xr1, 1
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 5, i32 7, i32 7>
-    ret <4 x double> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
deleted file mode 100644
index 4cc819018f0a8..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
+++ /dev/null
@@ -1,79 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
-
-;; xvshuf.b
-define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    xvld $xr2, $a0, 0
-; CHECK-NEXT:    xvshuf.b $xr0, $xr1, $xr0, $xr2
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
-                                                               i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
-    ret <32 x i8> %c
-}
-
-;; xvshuf.h
-define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    xvld $xr2, $a0, 0
-; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 78
-; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 78
-; CHECK-NEXT:    xvshuf.h $xr2, $xr1, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 27, i32 26, i32 25, i32 24,
-                                                                 i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3>
-    ret <16 x i16> %c
-}
-
-;; xvshuf.w
-define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    xvld $xr2, $a0, 0
-; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 68
-; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 68
-; CHECK-NEXT:    xvshuf.w $xr2, $xr1, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 9, i32 3, i32 2, i32 8, i32 9, i32 3, i32 2>
-    ret <8 x i32> %c
-}
-
-;; xvshuf.d
-define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: shufflevector_v4i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    xvld $xr2, $a0, 0
-; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 238
-; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 238
-; CHECK-NEXT:    xvshuf.d $xr2, $xr1, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-    ret <4 x i64> %c
-}
-
-;; xvshuf.w
-define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: shufflevector_v8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    xvld $xr2, $a0, 0
-; CHECK-NEXT:    xvshuf.w $xr2, $xr1, $xr0
-; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 0, i32 10, i32 9, i32 4, i32 5, i32 12, i32 13>
-    ret <8 x float> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
deleted file mode 100644
index dc4532a7292ab..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
-
-;; xxvshuf4i.b
-define <32 x i8> @shufflevector_xvshuf4i_v32i8(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: shufflevector_xvshuf4i_v32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 27
-; CHECK-NEXT:    ret
-    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12,
-                                                               i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 26, i32 25, i32 24, i32 31, i32 30, i32 29, i32 28>
-    ret <32 x i8> %c
-}
-
-;; xvshuf4i.h
-define <16 x i16> @shufflevector_xvshuf4i_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: shufflevector_xvshuf4i_v16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvshuf4i.h $xr0, $xr0, 27
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-    ret <16 x i16> %c
-}
-
-;; xvshuf4i.w
-define <8 x i32> @shufflevector_xvshuf4i_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: shufflevector_xvshuf4i_v8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 27
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-    ret <8 x i32> %c
-}
-
-;; xvshuf4i.w
-define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: shufflevector_xvshuf4i_v8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 27
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-    ret <8 x float> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
deleted file mode 100644
index 31398c6081c0a..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
+++ /dev/null
@@ -1,82 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
-
-;; vilvl.b
-define <16 x i8> @shufflevector_vilvl_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_vilvl_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-    ret <16 x i8> %c
-}
-
-;; vilvl.h
-define <8 x i16> @shufflevector_vilvl_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_vilvl_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-    ret <8 x i16> %c
-}
-
-;; vilvl.w
-define <4 x i32> @shufflevector_vilvl_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_vilvl_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-    ret <4 x i32> %c
-}
-
-;; vilvl.w
-define <4 x float> @shufflevector_vilvl_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_vilvl_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-    ret <4 x float> %c
-}
-
-;; vilvh.b
-define <16 x i8> @shufflevector_vilvh_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_vilvh_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-    ret <16 x i8> %c
-}
-
-;; vilvh.h
-define <8 x i16> @shufflevector_vilvh_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_vilvh_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-    ret <8 x i16> %c
-}
-
-;; vilvh.w
-define <4 x i32> @shufflevector_vilvh_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_vilvh_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-    ret <4 x i32> %c
-}
-
-;; vilvh.w
-define <4 x float> @shufflevector_vilvh_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_vilvh_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-    ret <4 x float> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
deleted file mode 100644
index 171e68306cd11..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
+++ /dev/null
@@ -1,122 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
-
-;; vpackev.b
-define <16 x i8> @shufflevector_pack_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackev.b $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-    ret <16 x i8> %c
-}
-
-;; vpackev.h
-define <8 x i16> @shufflevector_pack_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-    ret <8 x i16> %c
-}
-
-;; vpackev.w
-define <4 x i32> @shufflevector_pack_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackev.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-    ret <4 x i32> %c
-}
-
-;; vpickev.d/vpackev.d/vilvl.d
-define <2 x i64> @shufflevector_pack_ev_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v2i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackev.d $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-    ret <2 x i64> %c
-}
-
-;; vpackev.w
-define <4 x float> @shufflevector_pack_ev_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackev.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-    ret <4 x float> %c
-}
-
-;; vpickev.d/vpackev.d/vilvl.d
-define <2 x double> @shufflevector_pack_ev_v2f64(<2 x double> %a, <2 x double> %b) {
-; CHECK-LABEL: shufflevector_pack_ev_v2f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackev.d $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
-    ret <2 x double> %c
-}
-
-;; vpackod.b
-define <16 x i8> @shufflevector_pack_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackod.b $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-    ret <16 x i8> %c
-}
-
-;; vpackod.h
-define <8 x i16> @shufflevector_pack_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-    ret <8 x i16> %c
-}
-
-;; vpackod.w
-define <4 x i32> @shufflevector_pack_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackod.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-    ret <4 x i32> %c
-}
-
-;; vpickod.d/vpackod.d/vilvh.d
-define <2 x i64> @shufflevector_pack_od_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v2i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackod.d $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-    ret <2 x i64> %c
-}
-
-;; vpackod.w
-define <4 x float> @shufflevector_pack_od_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackod.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-    ret <4 x float> %c
-}
-
-;; vpickod.d/vpackod.d/vilvh.d
-define <2 x double> @shufflevector_pack_od_v2f64(<2 x double> %a, <2 x double> %b) {
-; CHECK-LABEL: shufflevector_pack_od_v2f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackod.d $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
-    ret <2 x double> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
deleted file mode 100644
index ca636d942b583..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
+++ /dev/null
@@ -1,82 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
-
-;; vpickev.b
-define <16 x i8> @shufflevector_pick_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_pick_ev_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpickev.b $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-    ret <16 x i8> %c
-}
-
-;; vpickev.h
-define <8 x i16> @shufflevector_pick_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_pick_ev_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpickev.h $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-    ret <8 x i16> %c
-}
-
-;; vpickev.w
-define <4 x i32> @shufflevector_pick_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_pick_ev_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpickev.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-    ret <4 x i32> %c
-}
-
-;; vpickev.w
-define <4 x float> @shufflevector_pick_ev_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_pick_ev_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpickev.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-    ret <4 x float> %c
-}
-
-;; vpickod.b
-define <16 x i8> @shufflevector_pick_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_pick_od_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpickod.b $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-    ret <16 x i8> %c
-}
-
-;; vpickod.h
-define <8 x i16> @shufflevector_pick_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_pick_od_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpickod.h $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-    ret <8 x i16> %c
-}
-
-;; vpickod.w
-define <4 x i32> @shufflevector_pick_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_pick_od_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpickod.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-    ret <4 x i32> %c
-}
-
-;; vpickod.w
-define <4 x float> @shufflevector_pick_od_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_pick_od_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpickod.w $vr0, $vr1, $vr0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-    ret <4 x float> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
deleted file mode 100644
index 10510786f3216..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
-
-;; vreplvei.b
-define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vreplvei.b $vr0, $vr0, 1
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-    ret <16 x i8> %c
-}
-
-;; vreplvei.h
-define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vreplvei.h $vr0, $vr1, 2
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
-    ret <8 x i16> %c
-}
-
-;; vreplvei.w
-define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
-    ret <4 x i32> %c
-}
-
-;; vreplvei.d
-define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: shufflevector_v2i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 1
-; CHECK-NEXT:    ret
-    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
-    ret <2 x i64> %c
-}
-
-;; vreplvei.w
-define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
-    ret <4 x float> %c
-}
-
-;; vreplvei.d
-define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
-; CHECK-LABEL: shufflevector_v2f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 1
-; CHECK-NEXT:    ret
-    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
-    ret <2 x double> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
deleted file mode 100644
index 55800b31446b3..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
-
-define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vld $vr2, $a0, 0
-; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr0, $vr2
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 2, i32 4, i32 6, i32 8, i32 25, i32 30, i32 31, i32 31>
-    ret <16 x i8> %c
-}
-
-;; vshuf.h
-define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vld $vr2, $a0, 0
-; CHECK-NEXT:    vshuf.h $vr2, $vr1, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
-    ret <8 x i16> %c
-}
-
-;; vshuf.w
-define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    vld $vr2, $a0, 0
-; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
-    ret <4 x i32> %c
-}
-
-;; vshuf.d
-define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: shufflevector_v2i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vld $vr2, $a0, 0
-; CHECK-NEXT:    vshuf.d $vr2, $vr1, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
-    ret <2 x i64> %c
-}
-
-;; vshuf.w
-define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vld $vr2, $a0, 0
-; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
-    ret <4 x float> %c
-}
-
-;; vshuf.d
-define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
-; CHECK-LABEL: shufflevector_v2f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    vld $vr2, $a0, 0
-; CHECK-NEXT:    vshuf.d $vr2, $vr1, $vr0
-; CHECK-NEXT:    vori.b $vr0, $vr2, 0
-; CHECK-NEXT:    ret
-    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-    ret <2 x double> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
deleted file mode 100644
index 660b9581c3d1f..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
-
-;; vshuf4i.b
-define <16 x i8> @shufflevector_vshuf4i_v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shufflevector_vshuf4i_v16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 27
-; CHECK-NEXT:    ret
-    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-    ret <16 x i8> %c
-}
-
-;; vshuf4i.h
-define <8 x i16> @shufflevector_vshuf4i_v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shufflevector_vshuf4i_v8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 27
-; CHECK-NEXT:    ret
-    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-    ret <8 x i16> %c
-}
-
-;; vshuf4i.w
-define <4 x i32> @shufflevector_vshuf4i_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shufflevector_vshuf4i_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 27
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-    ret <4 x i32> %c
-}
-
-;; vshuf4i.w
-define <4 x float> @shufflevector_vshuf4i_v4f32(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: shufflevector_vshuf4i_v4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 27
-; CHECK-NEXT:    ret
-    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-    ret <4 x float> %c
-}
diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
index 6a15d3a9cda30..3e8ef59a75633 100644
--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=0 < %s \
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --post-RA-scheduler=0 < %s \
 ; RUN:     | FileCheck %s --check-prefix=MEDIUM_NO_SCH
-; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=1 < %s \
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --post-RA-scheduler=1 < %s \
 ; RUN:     | FileCheck %s --check-prefix=MEDIUM_SCH
-; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --post-RA-scheduler=0 < %s \
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --post-RA-scheduler=0 < %s \
 ; RUN:     | FileCheck %s --check-prefix=LARGE_NO_SCH
-; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --post-RA-scheduler=1 < %s \
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --post-RA-scheduler=1 < %s \
 ; RUN:     | FileCheck %s --check-prefix=LARGE_SCH
 
 @g = dso_local global i64 zeroinitializer, align 4
@@ -24,25 +24,21 @@ define void @foo() nounwind {
 ; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
 ; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
 ; MEDIUM_NO_SCH-NEXT:    ld.d $zero, $a0, 0
-; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
-; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(.Lg$local)
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
 ; MEDIUM_NO_SCH-NEXT:    ld.d $zero, $a0, 0
 ; MEDIUM_NO_SCH-NEXT:    ori $a0, $zero, 1
 ; MEDIUM_NO_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
 ; MEDIUM_NO_SCH-NEXT:    jirl $ra, $ra, 0
-; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
-; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %got_pc_lo12(gd)
-; MEDIUM_NO_SCH-NEXT:    pcaddu18i $ra, %call36(__tls_get_addr)
-; MEDIUM_NO_SCH-NEXT:    jirl $ra, $ra, 0
-; MEDIUM_NO_SCH-NEXT:    ld.d $zero, $a0, 0
-; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
-; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %got_pc_lo12(ld)
-; MEDIUM_NO_SCH-NEXT:    pcaddu18i $ra, %call36(__tls_get_addr)
-; MEDIUM_NO_SCH-NEXT:    jirl $ra, $ra, 0
-; MEDIUM_NO_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ie)
-; MEDIUM_NO_SCH-NEXT:    ld.d $a1, $a1, %ie_pc_lo12(ie)
-; MEDIUM_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ld)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a1, $a1, %ie_pc_lo12(ld)
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a2, $a2, %ie_pc_lo12(ie)
+; MEDIUM_NO_SCH-NEXT:    ldx.d $zero, $a0, $tp
 ; MEDIUM_NO_SCH-NEXT:    ldx.d $zero, $a1, $tp
+; MEDIUM_NO_SCH-NEXT:    ldx.d $zero, $a2, $tp
 ; MEDIUM_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; MEDIUM_NO_SCH-NEXT:    addi.d $sp, $sp, 16
 ; MEDIUM_NO_SCH-NEXT:    ret
@@ -54,25 +50,21 @@ define void @foo() nounwind {
 ; MEDIUM_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
 ; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
 ; MEDIUM_SCH-NEXT:    ld.d $zero, $a0, 0
-; MEDIUM_SCH-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
-; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(.Lg$local)
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
 ; MEDIUM_SCH-NEXT:    ld.d $zero, $a0, 0
 ; MEDIUM_SCH-NEXT:    ori $a0, $zero, 1
 ; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
 ; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
-; MEDIUM_SCH-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
-; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %got_pc_lo12(gd)
-; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(__tls_get_addr)
-; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
-; MEDIUM_SCH-NEXT:    ld.d $zero, $a0, 0
-; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
-; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %got_pc_lo12(ld)
-; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(__tls_get_addr)
-; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
-; MEDIUM_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ie)
-; MEDIUM_SCH-NEXT:    ld.d $zero, $a0, 0
-; MEDIUM_SCH-NEXT:    ld.d $a1, $a1, %ie_pc_lo12(ie)
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; MEDIUM_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ld)
+; MEDIUM_SCH-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
+; MEDIUM_SCH-NEXT:    ld.d $a1, $a1, %ie_pc_lo12(ld)
+; MEDIUM_SCH-NEXT:    ld.d $a2, $a2, %ie_pc_lo12(ie)
+; MEDIUM_SCH-NEXT:    ldx.d $zero, $a0, $tp
 ; MEDIUM_SCH-NEXT:    ldx.d $zero, $a1, $tp
+; MEDIUM_SCH-NEXT:    ldx.d $zero, $a2, $tp
 ; MEDIUM_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; MEDIUM_SCH-NEXT:    addi.d $sp, $sp, 16
 ; MEDIUM_SCH-NEXT:    ret
@@ -87,10 +79,10 @@ define void @foo() nounwind {
 ; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
 ; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_NO_SCH-NEXT:    ld.d $zero, $a0, 0
-; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(.Lg$local)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(.Lg$local)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(.Lg$local)
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(g)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
 ; LARGE_NO_SCH-NEXT:    add.d $a0, $t8, $a0
 ; LARGE_NO_SCH-NEXT:    ld.d $zero, $a0, 0
 ; LARGE_NO_SCH-NEXT:    ori $a0, $zero, 1
@@ -100,36 +92,24 @@ define void @foo() nounwind {
 ; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(bar)
 ; LARGE_NO_SCH-NEXT:    ldx.d $ra, $t8, $ra
 ; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(gd)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(gd)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(gd)
-; LARGE_NO_SCH-NEXT:    add.d $a0, $t8, $a0
-; LARGE_NO_SCH-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LARGE_NO_SCH-NEXT:    add.d $ra, $t8, $ra
-; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_NO_SCH-NEXT:    ld.d $zero, $a0, 0
-; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(ld)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(ld)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(ld)
-; LARGE_NO_SCH-NEXT:    add.d $a0, $t8, $a0
-; LARGE_NO_SCH-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LARGE_NO_SCH-NEXT:    add.d $ra, $t8, $ra
-; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_NO_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ie)
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(gd)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(gd)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(gd)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ld)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LARGE_NO_SCH-NEXT:    ldx.d $a1, $t8, $a1
+; LARGE_NO_SCH-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
 ; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
 ; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
 ; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LARGE_NO_SCH-NEXT:    ldx.d $a1, $t8, $a1
-; LARGE_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGE_NO_SCH-NEXT:    ldx.d $a2, $t8, $a2
+; LARGE_NO_SCH-NEXT:    ldx.d $zero, $a0, $tp
 ; LARGE_NO_SCH-NEXT:    ldx.d $zero, $a1, $tp
+; LARGE_NO_SCH-NEXT:    ldx.d $zero, $a2, $tp
 ; LARGE_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, 16
 ; LARGE_NO_SCH-NEXT:    ret
@@ -144,10 +124,10 @@ define void @foo() nounwind {
 ; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
 ; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_SCH-NEXT:    ld.d $zero, $a0, 0
-; LARGE_SCH-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(.Lg$local)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(.Lg$local)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(.Lg$local)
+; LARGE_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(g)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
 ; LARGE_SCH-NEXT:    add.d $a0, $t8, $a0
 ; LARGE_SCH-NEXT:    ld.d $zero, $a0, 0
 ; LARGE_SCH-NEXT:    ori $a0, $zero, 1
@@ -157,36 +137,24 @@ define void @foo() nounwind {
 ; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(bar)
 ; LARGE_SCH-NEXT:    ldx.d $ra, $t8, $ra
 ; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_SCH-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(gd)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(gd)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(gd)
-; LARGE_SCH-NEXT:    add.d $a0, $t8, $a0
-; LARGE_SCH-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LARGE_SCH-NEXT:    add.d $ra, $t8, $ra
-; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_SCH-NEXT:    ld.d $zero, $a0, 0
-; LARGE_SCH-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(ld)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(ld)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(ld)
-; LARGE_SCH-NEXT:    add.d $a0, $t8, $a0
-; LARGE_SCH-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LARGE_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LARGE_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LARGE_SCH-NEXT:    add.d $ra, $t8, $ra
-; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ie)
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(gd)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(gd)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(gd)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_SCH-NEXT:    pcalau12i $a1, %ie_pc_hi20(ld)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LARGE_SCH-NEXT:    ldx.d $a1, $t8, $a1
+; LARGE_SCH-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
 ; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
 ; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
 ; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LARGE_SCH-NEXT:    ldx.d $a1, $t8, $a1
-; LARGE_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGE_SCH-NEXT:    ldx.d $a2, $t8, $a2
+; LARGE_SCH-NEXT:    ldx.d $zero, $a0, $tp
 ; LARGE_SCH-NEXT:    ldx.d $zero, $a1, $tp
+; LARGE_SCH-NEXT:    ldx.d $zero, $a2, $tp
 ; LARGE_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE_SCH-NEXT:    addi.d $sp, $sp, 16
 ; LARGE_SCH-NEXT:    ret
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index 396c29512933c..4449e4f2ea4ed 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -7,20 +7,20 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<19>;
 ; CHECK-NEXT:    .reg .b32 %r<20>;
-; CHECK-NEXT:    .reg .b64 %rd<129>;
+; CHECK-NEXT:    .reg .b64 %rd<127>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0];
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1];
 ; CHECK-NEXT:    shr.s64 %rd2, %rd46, 63;
-; CHECK-NEXT:    mov.u64 %rd119, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd52, %rd119, %rd45;
-; CHECK-NEXT:    subc.cc.s64 %rd53, %rd119, %rd46;
+; CHECK-NEXT:    mov.u64 %rd117, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd52, %rd117, %rd45;
+; CHECK-NEXT:    subc.cc.s64 %rd53, %rd117, %rd46;
 ; CHECK-NEXT:    setp.lt.s64 %p1, %rd46, 0;
 ; CHECK-NEXT:    selp.b64 %rd4, %rd53, %rd46, %p1;
 ; CHECK-NEXT:    selp.b64 %rd3, %rd52, %rd45, %p1;
-; CHECK-NEXT:    sub.cc.s64 %rd54, %rd119, %rd49;
-; CHECK-NEXT:    subc.cc.s64 %rd55, %rd119, %rd50;
+; CHECK-NEXT:    sub.cc.s64 %rd54, %rd117, %rd49;
+; CHECK-NEXT:    subc.cc.s64 %rd55, %rd117, %rd50;
 ; CHECK-NEXT:    setp.lt.s64 %p2, %rd50, 0;
 ; CHECK-NEXT:    selp.b64 %rd6, %rd55, %rd50, %p2;
 ; CHECK-NEXT:    selp.b64 %rd5, %rd54, %rd49, %p2;
@@ -43,109 +43,109 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd63, %r4;
 ; CHECK-NEXT:    add.s64 %rd64, %rd63, 64;
 ; CHECK-NEXT:    selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT:    sub.cc.s64 %rd66, %rd61, %rd65;
-; CHECK-NEXT:    subc.cc.s64 %rd67, %rd119, 0;
-; CHECK-NEXT:    setp.eq.s64 %p8, %rd67, 0;
-; CHECK-NEXT:    setp.ne.s64 %p9, %rd67, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd7, %rd61, %rd65;
+; CHECK-NEXT:    subc.cc.s64 %rd8, %rd117, 0;
+; CHECK-NEXT:    setp.eq.s64 %p8, %rd8, 0;
+; CHECK-NEXT:    setp.ne.s64 %p9, %rd8, 0;
 ; CHECK-NEXT:    selp.u32 %r5, -1, 0, %p9;
-; CHECK-NEXT:    setp.gt.u64 %p10, %rd66, 127;
+; CHECK-NEXT:    setp.gt.u64 %p10, %rd7, 127;
 ; CHECK-NEXT:    selp.u32 %r6, -1, 0, %p10;
 ; CHECK-NEXT:    selp.b32 %r7, %r6, %r5, %p8;
 ; CHECK-NEXT:    and.b32 %r8, %r7, 1;
 ; CHECK-NEXT:    setp.eq.b32 %p11, %r8, 1;
 ; CHECK-NEXT:    or.pred %p12, %p5, %p11;
-; CHECK-NEXT:    xor.b64 %rd68, %rd66, 127;
-; CHECK-NEXT:    or.b64 %rd69, %rd68, %rd67;
-; CHECK-NEXT:    setp.eq.s64 %p13, %rd69, 0;
-; CHECK-NEXT:    selp.b64 %rd128, 0, %rd4, %p12;
-; CHECK-NEXT:    selp.b64 %rd127, 0, %rd3, %p12;
+; CHECK-NEXT:    xor.b64 %rd66, %rd7, 127;
+; CHECK-NEXT:    or.b64 %rd67, %rd66, %rd8;
+; CHECK-NEXT:    setp.eq.s64 %p13, %rd67, 0;
+; CHECK-NEXT:    selp.b64 %rd126, 0, %rd4, %p12;
+; CHECK-NEXT:    selp.b64 %rd125, 0, %rd3, %p12;
 ; CHECK-NEXT:    or.pred %p14, %p12, %p13;
 ; CHECK-NEXT:    @%p14 bra $L__BB0_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd121, %rd66, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd122, %rd67, 0;
-; CHECK-NEXT:    or.b64 %rd72, %rd121, %rd122;
-; CHECK-NEXT:    setp.eq.s64 %p15, %rd72, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd66;
+; CHECK-NEXT:    add.cc.s64 %rd119, %rd7, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd120, %rd8, 0;
+; CHECK-NEXT:    or.b64 %rd70, %rd119, %rd120;
+; CHECK-NEXT:    setp.eq.s64 %p15, %rd70, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd7;
 ; CHECK-NEXT:    mov.b32 %r10, 127;
 ; CHECK-NEXT:    sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT:    shl.b64 %rd73, %rd4, %r11;
+; CHECK-NEXT:    shl.b64 %rd71, %rd4, %r11;
 ; CHECK-NEXT:    mov.b32 %r12, 64;
 ; CHECK-NEXT:    sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT:    shr.u64 %rd74, %rd3, %r13;
-; CHECK-NEXT:    or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT:    shr.u64 %rd72, %rd3, %r13;
+; CHECK-NEXT:    or.b64 %rd73, %rd71, %rd72;
 ; CHECK-NEXT:    mov.b32 %r14, 63;
 ; CHECK-NEXT:    sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT:    shl.b64 %rd76, %rd3, %r15;
+; CHECK-NEXT:    shl.b64 %rd74, %rd3, %r15;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r11, 63;
-; CHECK-NEXT:    selp.b64 %rd126, %rd76, %rd75, %p16;
-; CHECK-NEXT:    shl.b64 %rd125, %rd3, %r11;
-; CHECK-NEXT:    mov.u64 %rd116, %rd119;
+; CHECK-NEXT:    selp.b64 %rd124, %rd74, %rd73, %p16;
+; CHECK-NEXT:    shl.b64 %rd123, %rd3, %r11;
+; CHECK-NEXT:    mov.u64 %rd114, %rd117;
 ; CHECK-NEXT:    @%p15 bra $L__BB0_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r16, %rd121;
-; CHECK-NEXT:    shr.u64 %rd79, %rd3, %r16;
+; CHECK-NEXT:    cvt.u32.u64 %r16, %rd119;
+; CHECK-NEXT:    shr.u64 %rd77, %rd3, %r16;
 ; CHECK-NEXT:    sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT:    shl.b64 %rd80, %rd4, %r18;
-; CHECK-NEXT:    or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT:    shl.b64 %rd78, %rd4, %r18;
+; CHECK-NEXT:    or.b64 %rd79, %rd77, %rd78;
 ; CHECK-NEXT:    add.s32 %r19, %r16, -64;
-; CHECK-NEXT:    shr.u64 %rd82, %rd4, %r19;
+; CHECK-NEXT:    shr.u64 %rd80, %rd4, %r19;
 ; CHECK-NEXT:    setp.gt.s32 %p17, %r16, 63;
-; CHECK-NEXT:    selp.b64 %rd123, %rd82, %rd81, %p17;
-; CHECK-NEXT:    shr.u64 %rd124, %rd4, %r16;
+; CHECK-NEXT:    selp.b64 %rd121, %rd80, %rd79, %p17;
+; CHECK-NEXT:    shr.u64 %rd122, %rd4, %r16;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd5, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT:    mov.u64 %rd116, 0;
-; CHECK-NEXT:    mov.u64 %rd119, %rd116;
+; CHECK-NEXT:    mov.u64 %rd114, 0;
+; CHECK-NEXT:    mov.u64 %rd117, %rd114;
 ; CHECK-NEXT:  $L__BB0_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd83, %rd123, 63;
-; CHECK-NEXT:    shl.b64 %rd84, %rd124, 1;
-; CHECK-NEXT:    or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT:    shl.b64 %rd86, %rd123, 1;
-; CHECK-NEXT:    shr.u64 %rd87, %rd126, 63;
-; CHECK-NEXT:    or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT:    shr.u64 %rd89, %rd125, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd126, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd125, 1;
-; CHECK-NEXT:    or.b64 %rd125, %rd119, %rd92;
-; CHECK-NEXT:    or.b64 %rd126, %rd116, %rd91;
-; CHECK-NEXT:    sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT:    subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT:    shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT:    and.b64 %rd119, %rd95, 1;
-; CHECK-NEXT:    and.b64 %rd96, %rd95, %rd5;
-; CHECK-NEXT:    and.b64 %rd97, %rd95, %rd6;
-; CHECK-NEXT:    sub.cc.s64 %rd123, %rd88, %rd96;
-; CHECK-NEXT:    subc.cc.s64 %rd124, %rd85, %rd97;
-; CHECK-NEXT:    add.cc.s64 %rd121, %rd121, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd122, %rd122, -1;
-; CHECK-NEXT:    or.b64 %rd98, %rd121, %rd122;
-; CHECK-NEXT:    setp.eq.s64 %p18, %rd98, 0;
+; CHECK-NEXT:    shr.u64 %rd81, %rd121, 63;
+; CHECK-NEXT:    shl.b64 %rd82, %rd122, 1;
+; CHECK-NEXT:    or.b64 %rd83, %rd82, %rd81;
+; CHECK-NEXT:    shl.b64 %rd84, %rd121, 1;
+; CHECK-NEXT:    shr.u64 %rd85, %rd124, 63;
+; CHECK-NEXT:    or.b64 %rd86, %rd84, %rd85;
+; CHECK-NEXT:    shr.u64 %rd87, %rd123, 63;
+; CHECK-NEXT:    shl.b64 %rd88, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
+; CHECK-NEXT:    shl.b64 %rd90, %rd123, 1;
+; CHECK-NEXT:    or.b64 %rd123, %rd117, %rd90;
+; CHECK-NEXT:    or.b64 %rd124, %rd114, %rd89;
+; CHECK-NEXT:    sub.cc.s64 %rd91, %rd35, %rd86;
+; CHECK-NEXT:    subc.cc.s64 %rd92, %rd36, %rd83;
+; CHECK-NEXT:    shr.s64 %rd93, %rd92, 63;
+; CHECK-NEXT:    and.b64 %rd117, %rd93, 1;
+; CHECK-NEXT:    and.b64 %rd94, %rd93, %rd5;
+; CHECK-NEXT:    and.b64 %rd95, %rd93, %rd6;
+; CHECK-NEXT:    sub.cc.s64 %rd121, %rd86, %rd94;
+; CHECK-NEXT:    subc.cc.s64 %rd122, %rd83, %rd95;
+; CHECK-NEXT:    add.cc.s64 %rd119, %rd119, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd120, %rd120, -1;
+; CHECK-NEXT:    or.b64 %rd96, %rd119, %rd120;
+; CHECK-NEXT:    setp.eq.s64 %p18, %rd96, 0;
 ; CHECK-NEXT:    @%p18 bra $L__BB0_4;
 ; CHECK-NEXT:    bra.uni $L__BB0_2;
 ; CHECK-NEXT:  $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd99, %rd125, 63;
-; CHECK-NEXT:    shl.b64 %rd100, %rd126, 1;
-; CHECK-NEXT:    or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT:    shl.b64 %rd102, %rd125, 1;
-; CHECK-NEXT:    or.b64 %rd127, %rd119, %rd102;
-; CHECK-NEXT:    or.b64 %rd128, %rd116, %rd101;
+; CHECK-NEXT:    shr.u64 %rd97, %rd123, 63;
+; CHECK-NEXT:    shl.b64 %rd98, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd99, %rd98, %rd97;
+; CHECK-NEXT:    shl.b64 %rd100, %rd123, 1;
+; CHECK-NEXT:    or.b64 %rd125, %rd117, %rd100;
+; CHECK-NEXT:    or.b64 %rd126, %rd114, %rd99;
 ; CHECK-NEXT:  $L__BB0_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd103, %rd5, %rd127;
-; CHECK-NEXT:    mul.lo.s64 %rd104, %rd5, %rd128;
+; CHECK-NEXT:    mul.hi.u64 %rd101, %rd5, %rd125;
+; CHECK-NEXT:    mul.lo.s64 %rd102, %rd5, %rd126;
+; CHECK-NEXT:    add.s64 %rd103, %rd101, %rd102;
+; CHECK-NEXT:    mul.lo.s64 %rd104, %rd6, %rd125;
 ; CHECK-NEXT:    add.s64 %rd105, %rd103, %rd104;
-; CHECK-NEXT:    mul.lo.s64 %rd106, %rd6, %rd127;
-; CHECK-NEXT:    add.s64 %rd107, %rd105, %rd106;
-; CHECK-NEXT:    mul.lo.s64 %rd108, %rd5, %rd127;
-; CHECK-NEXT:    sub.cc.s64 %rd109, %rd3, %rd108;
-; CHECK-NEXT:    subc.cc.s64 %rd110, %rd4, %rd107;
-; CHECK-NEXT:    xor.b64 %rd111, %rd109, %rd2;
-; CHECK-NEXT:    xor.b64 %rd112, %rd110, %rd2;
-; CHECK-NEXT:    sub.cc.s64 %rd113, %rd111, %rd2;
-; CHECK-NEXT:    subc.cc.s64 %rd114, %rd112, %rd2;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd113, %rd114};
+; CHECK-NEXT:    mul.lo.s64 %rd106, %rd5, %rd125;
+; CHECK-NEXT:    sub.cc.s64 %rd107, %rd3, %rd106;
+; CHECK-NEXT:    subc.cc.s64 %rd108, %rd4, %rd105;
+; CHECK-NEXT:    xor.b64 %rd109, %rd107, %rd2;
+; CHECK-NEXT:    xor.b64 %rd110, %rd108, %rd2;
+; CHECK-NEXT:    sub.cc.s64 %rd111, %rd109, %rd2;
+; CHECK-NEXT:    subc.cc.s64 %rd112, %rd110, %rd2;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd111, %rd112};
 ; CHECK-NEXT:    ret;
   %div = srem i128 %lhs, %rhs
   ret i128 %div
@@ -156,7 +156,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<17>;
 ; CHECK-NEXT:    .reg .b32 %r<20>;
-; CHECK-NEXT:    .reg .b64 %rd<115>;
+; CHECK-NEXT:    .reg .b64 %rd<113>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -180,106 +180,106 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
 ; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
 ; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.u64 %rd105, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd57, %rd105, 0;
-; CHECK-NEXT:    setp.eq.s64 %p6, %rd57, 0;
-; CHECK-NEXT:    setp.ne.s64 %p7, %rd57, 0;
+; CHECK-NEXT:    mov.u64 %rd103, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd5, %rd50, %rd54;
+; CHECK-NEXT:    subc.cc.s64 %rd6, %rd103, 0;
+; CHECK-NEXT:    setp.eq.s64 %p6, %rd6, 0;
+; CHECK-NEXT:    setp.ne.s64 %p7, %rd6, 0;
 ; CHECK-NEXT:    selp.u32 %r5, -1, 0, %p7;
-; CHECK-NEXT:    setp.gt.u64 %p8, %rd56, 127;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd5, 127;
 ; CHECK-NEXT:    selp.u32 %r6, -1, 0, %p8;
 ; CHECK-NEXT:    selp.b32 %r7, %r6, %r5, %p6;
 ; CHECK-NEXT:    and.b32 %r8, %r7, 1;
 ; CHECK-NEXT:    setp.eq.b32 %p9, %r8, 1;
 ; CHECK-NEXT:    or.pred %p10, %p3, %p9;
-; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
-; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
-; CHECK-NEXT:    setp.eq.s64 %p11, %rd59, 0;
-; CHECK-NEXT:    selp.b64 %rd114, 0, %rd42, %p10;
-; CHECK-NEXT:    selp.b64 %rd113, 0, %rd41, %p10;
+; CHECK-NEXT:    xor.b64 %rd56, %rd5, 127;
+; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd6;
+; CHECK-NEXT:    setp.eq.s64 %p11, %rd57, 0;
+; CHECK-NEXT:    selp.b64 %rd112, 0, %rd42, %p10;
+; CHECK-NEXT:    selp.b64 %rd111, 0, %rd41, %p10;
 ; CHECK-NEXT:    or.pred %p12, %p10, %p11;
 ; CHECK-NEXT:    @%p12 bra $L__BB1_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd107, %rd56, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd108, %rd57, 0;
-; CHECK-NEXT:    or.b64 %rd62, %rd107, %rd108;
-; CHECK-NEXT:    setp.eq.s64 %p13, %rd62, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd56;
+; CHECK-NEXT:    add.cc.s64 %rd105, %rd5, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd106, %rd6, 0;
+; CHECK-NEXT:    or.b64 %rd60, %rd105, %rd106;
+; CHECK-NEXT:    setp.eq.s64 %p13, %rd60, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd5;
 ; CHECK-NEXT:    mov.b32 %r10, 127;
 ; CHECK-NEXT:    sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r11;
+; CHECK-NEXT:    shl.b64 %rd61, %rd42, %r11;
 ; CHECK-NEXT:    mov.b32 %r12, 64;
 ; CHECK-NEXT:    sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r13;
-; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
+; CHECK-NEXT:    shr.u64 %rd62, %rd41, %r13;
+; CHECK-NEXT:    or.b64 %rd63, %rd61, %rd62;
 ; CHECK-NEXT:    mov.b32 %r14, 63;
 ; CHECK-NEXT:    sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r15;
+; CHECK-NEXT:    shl.b64 %rd64, %rd41, %r15;
 ; CHECK-NEXT:    setp.gt.s32 %p14, %r11, 63;
-; CHECK-NEXT:    selp.b64 %rd112, %rd66, %rd65, %p14;
-; CHECK-NEXT:    shl.b64 %rd111, %rd41, %r11;
-; CHECK-NEXT:    mov.u64 %rd102, %rd105;
+; CHECK-NEXT:    selp.b64 %rd110, %rd64, %rd63, %p14;
+; CHECK-NEXT:    shl.b64 %rd109, %rd41, %r11;
+; CHECK-NEXT:    mov.u64 %rd100, %rd103;
 ; CHECK-NEXT:    @%p13 bra $L__BB1_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r16, %rd107;
-; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r16;
+; CHECK-NEXT:    cvt.u32.u64 %r16, %rd105;
+; CHECK-NEXT:    shr.u64 %rd67, %rd41, %r16;
 ; CHECK-NEXT:    sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r18;
-; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
+; CHECK-NEXT:    shl.b64 %rd68, %rd42, %r18;
+; CHECK-NEXT:    or.b64 %rd69, %rd67, %rd68;
 ; CHECK-NEXT:    add.s32 %r19, %r16, -64;
-; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r19;
+; CHECK-NEXT:    shr.u64 %rd70, %rd42, %r19;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r16, 63;
-; CHECK-NEXT:    selp.b64 %rd109, %rd72, %rd71, %p15;
-; CHECK-NEXT:    shr.u64 %rd110, %rd42, %r16;
+; CHECK-NEXT:    selp.b64 %rd107, %rd70, %rd69, %p15;
+; CHECK-NEXT:    shr.u64 %rd108, %rd42, %r16;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT:    mov.u64 %rd102, 0;
-; CHECK-NEXT:    mov.u64 %rd105, %rd102;
+; CHECK-NEXT:    mov.u64 %rd100, 0;
+; CHECK-NEXT:    mov.u64 %rd103, %rd100;
 ; CHECK-NEXT:  $L__BB1_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd73, %rd109, 63;
-; CHECK-NEXT:    shl.b64 %rd74, %rd110, 1;
-; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
-; CHECK-NEXT:    shl.b64 %rd76, %rd109, 1;
-; CHECK-NEXT:    shr.u64 %rd77, %rd112, 63;
-; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
-; CHECK-NEXT:    shr.u64 %rd79, %rd111, 63;
-; CHECK-NEXT:    shl.b64 %rd80, %rd112, 1;
-; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
-; CHECK-NEXT:    shl.b64 %rd82, %rd111, 1;
-; CHECK-NEXT:    or.b64 %rd111, %rd105, %rd82;
-; CHECK-NEXT:    or.b64 %rd112, %rd102, %rd81;
-; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
-; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
-; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
-; CHECK-NEXT:    and.b64 %rd105, %rd85, 1;
-; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd3;
-; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd109, %rd78, %rd86;
-; CHECK-NEXT:    subc.cc.s64 %rd110, %rd75, %rd87;
-; CHECK-NEXT:    add.cc.s64 %rd107, %rd107, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd108, %rd108, -1;
-; CHECK-NEXT:    or.b64 %rd88, %rd107, %rd108;
-; CHECK-NEXT:    setp.eq.s64 %p16, %rd88, 0;
+; CHECK-NEXT:    shr.u64 %rd71, %rd107, 63;
+; CHECK-NEXT:    shl.b64 %rd72, %rd108, 1;
+; CHECK-NEXT:    or.b64 %rd73, %rd72, %rd71;
+; CHECK-NEXT:    shl.b64 %rd74, %rd107, 1;
+; CHECK-NEXT:    shr.u64 %rd75, %rd110, 63;
+; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
+; CHECK-NEXT:    shr.u64 %rd77, %rd109, 63;
+; CHECK-NEXT:    shl.b64 %rd78, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd79, %rd78, %rd77;
+; CHECK-NEXT:    shl.b64 %rd80, %rd109, 1;
+; CHECK-NEXT:    or.b64 %rd109, %rd103, %rd80;
+; CHECK-NEXT:    or.b64 %rd110, %rd100, %rd79;
+; CHECK-NEXT:    sub.cc.s64 %rd81, %rd33, %rd76;
+; CHECK-NEXT:    subc.cc.s64 %rd82, %rd34, %rd73;
+; CHECK-NEXT:    shr.s64 %rd83, %rd82, 63;
+; CHECK-NEXT:    and.b64 %rd103, %rd83, 1;
+; CHECK-NEXT:    and.b64 %rd84, %rd83, %rd3;
+; CHECK-NEXT:    and.b64 %rd85, %rd83, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd107, %rd76, %rd84;
+; CHECK-NEXT:    subc.cc.s64 %rd108, %rd73, %rd85;
+; CHECK-NEXT:    add.cc.s64 %rd105, %rd105, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd106, %rd106, -1;
+; CHECK-NEXT:    or.b64 %rd86, %rd105, %rd106;
+; CHECK-NEXT:    setp.eq.s64 %p16, %rd86, 0;
 ; CHECK-NEXT:    @%p16 bra $L__BB1_4;
 ; CHECK-NEXT:    bra.uni $L__BB1_2;
 ; CHECK-NEXT:  $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd89, %rd111, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd112, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd111, 1;
-; CHECK-NEXT:    or.b64 %rd113, %rd105, %rd92;
-; CHECK-NEXT:    or.b64 %rd114, %rd102, %rd91;
+; CHECK-NEXT:    shr.u64 %rd87, %rd109, 63;
+; CHECK-NEXT:    shl.b64 %rd88, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
+; CHECK-NEXT:    shl.b64 %rd90, %rd109, 1;
+; CHECK-NEXT:    or.b64 %rd111, %rd103, %rd90;
+; CHECK-NEXT:    or.b64 %rd112, %rd100, %rd89;
 ; CHECK-NEXT:  $L__BB1_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd93, %rd3, %rd113;
-; CHECK-NEXT:    mul.lo.s64 %rd94, %rd3, %rd114;
+; CHECK-NEXT:    mul.hi.u64 %rd91, %rd3, %rd111;
+; CHECK-NEXT:    mul.lo.s64 %rd92, %rd3, %rd112;
+; CHECK-NEXT:    add.s64 %rd93, %rd91, %rd92;
+; CHECK-NEXT:    mul.lo.s64 %rd94, %rd4, %rd111;
 ; CHECK-NEXT:    add.s64 %rd95, %rd93, %rd94;
-; CHECK-NEXT:    mul.lo.s64 %rd96, %rd4, %rd113;
-; CHECK-NEXT:    add.s64 %rd97, %rd95, %rd96;
-; CHECK-NEXT:    mul.lo.s64 %rd98, %rd3, %rd113;
-; CHECK-NEXT:    sub.cc.s64 %rd99, %rd41, %rd98;
-; CHECK-NEXT:    subc.cc.s64 %rd100, %rd42, %rd97;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd99, %rd100};
+; CHECK-NEXT:    mul.lo.s64 %rd96, %rd3, %rd111;
+; CHECK-NEXT:    sub.cc.s64 %rd97, %rd41, %rd96;
+; CHECK-NEXT:    subc.cc.s64 %rd98, %rd42, %rd95;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd97, %rd98};
 ; CHECK-NEXT:    ret;
   %div = urem i128 %lhs, %rhs
   ret i128 %div
@@ -325,19 +325,19 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<19>;
 ; CHECK-NEXT:    .reg .b32 %r<20>;
-; CHECK-NEXT:    .reg .b64 %rd<122>;
+; CHECK-NEXT:    .reg .b64 %rd<120>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0];
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1];
-; CHECK-NEXT:    mov.u64 %rd112, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd52, %rd112, %rd45;
-; CHECK-NEXT:    subc.cc.s64 %rd53, %rd112, %rd46;
+; CHECK-NEXT:    mov.u64 %rd110, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd52, %rd110, %rd45;
+; CHECK-NEXT:    subc.cc.s64 %rd53, %rd110, %rd46;
 ; CHECK-NEXT:    setp.lt.s64 %p1, %rd46, 0;
 ; CHECK-NEXT:    selp.b64 %rd2, %rd53, %rd46, %p1;
 ; CHECK-NEXT:    selp.b64 %rd1, %rd52, %rd45, %p1;
-; CHECK-NEXT:    sub.cc.s64 %rd54, %rd112, %rd49;
-; CHECK-NEXT:    subc.cc.s64 %rd55, %rd112, %rd50;
+; CHECK-NEXT:    sub.cc.s64 %rd54, %rd110, %rd49;
+; CHECK-NEXT:    subc.cc.s64 %rd55, %rd110, %rd50;
 ; CHECK-NEXT:    setp.lt.s64 %p2, %rd50, 0;
 ; CHECK-NEXT:    selp.b64 %rd4, %rd55, %rd50, %p2;
 ; CHECK-NEXT:    selp.b64 %rd3, %rd54, %rd49, %p2;
@@ -362,101 +362,101 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd64, %r4;
 ; CHECK-NEXT:    add.s64 %rd65, %rd64, 64;
 ; CHECK-NEXT:    selp.b64 %rd66, %rd63, %rd65, %p7;
-; CHECK-NEXT:    sub.cc.s64 %rd67, %rd62, %rd66;
-; CHECK-NEXT:    subc.cc.s64 %rd68, %rd112, 0;
-; CHECK-NEXT:    setp.eq.s64 %p8, %rd68, 0;
-; CHECK-NEXT:    setp.ne.s64 %p9, %rd68, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd7, %rd62, %rd66;
+; CHECK-NEXT:    subc.cc.s64 %rd8, %rd110, 0;
+; CHECK-NEXT:    setp.eq.s64 %p8, %rd8, 0;
+; CHECK-NEXT:    setp.ne.s64 %p9, %rd8, 0;
 ; CHECK-NEXT:    selp.u32 %r5, -1, 0, %p9;
-; CHECK-NEXT:    setp.gt.u64 %p10, %rd67, 127;
+; CHECK-NEXT:    setp.gt.u64 %p10, %rd7, 127;
 ; CHECK-NEXT:    selp.u32 %r6, -1, 0, %p10;
 ; CHECK-NEXT:    selp.b32 %r7, %r6, %r5, %p8;
 ; CHECK-NEXT:    and.b32 %r8, %r7, 1;
 ; CHECK-NEXT:    setp.eq.b32 %p11, %r8, 1;
 ; CHECK-NEXT:    or.pred %p12, %p5, %p11;
-; CHECK-NEXT:    xor.b64 %rd69, %rd67, 127;
-; CHECK-NEXT:    or.b64 %rd70, %rd69, %rd68;
-; CHECK-NEXT:    setp.eq.s64 %p13, %rd70, 0;
-; CHECK-NEXT:    selp.b64 %rd121, 0, %rd2, %p12;
-; CHECK-NEXT:    selp.b64 %rd120, 0, %rd1, %p12;
+; CHECK-NEXT:    xor.b64 %rd67, %rd7, 127;
+; CHECK-NEXT:    or.b64 %rd68, %rd67, %rd8;
+; CHECK-NEXT:    setp.eq.s64 %p13, %rd68, 0;
+; CHECK-NEXT:    selp.b64 %rd119, 0, %rd2, %p12;
+; CHECK-NEXT:    selp.b64 %rd118, 0, %rd1, %p12;
 ; CHECK-NEXT:    or.pred %p14, %p12, %p13;
 ; CHECK-NEXT:    @%p14 bra $L__BB4_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd114, %rd67, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd115, %rd68, 0;
-; CHECK-NEXT:    or.b64 %rd73, %rd114, %rd115;
-; CHECK-NEXT:    setp.eq.s64 %p15, %rd73, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd67;
+; CHECK-NEXT:    add.cc.s64 %rd112, %rd7, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd113, %rd8, 0;
+; CHECK-NEXT:    or.b64 %rd71, %rd112, %rd113;
+; CHECK-NEXT:    setp.eq.s64 %p15, %rd71, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd7;
 ; CHECK-NEXT:    mov.b32 %r10, 127;
 ; CHECK-NEXT:    sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT:    shl.b64 %rd74, %rd2, %r11;
+; CHECK-NEXT:    shl.b64 %rd72, %rd2, %r11;
 ; CHECK-NEXT:    mov.b32 %r12, 64;
 ; CHECK-NEXT:    sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT:    shr.u64 %rd75, %rd1, %r13;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
+; CHECK-NEXT:    shr.u64 %rd73, %rd1, %r13;
+; CHECK-NEXT:    or.b64 %rd74, %rd72, %rd73;
 ; CHECK-NEXT:    mov.b32 %r14, 63;
 ; CHECK-NEXT:    sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT:    shl.b64 %rd77, %rd1, %r15;
+; CHECK-NEXT:    shl.b64 %rd75, %rd1, %r15;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r11, 63;
-; CHECK-NEXT:    selp.b64 %rd119, %rd77, %rd76, %p16;
-; CHECK-NEXT:    shl.b64 %rd118, %rd1, %r11;
-; CHECK-NEXT:    mov.u64 %rd109, %rd112;
+; CHECK-NEXT:    selp.b64 %rd117, %rd75, %rd74, %p16;
+; CHECK-NEXT:    shl.b64 %rd116, %rd1, %r11;
+; CHECK-NEXT:    mov.u64 %rd107, %rd110;
 ; CHECK-NEXT:    @%p15 bra $L__BB4_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r16, %rd114;
-; CHECK-NEXT:    shr.u64 %rd80, %rd1, %r16;
+; CHECK-NEXT:    cvt.u32.u64 %r16, %rd112;
+; CHECK-NEXT:    shr.u64 %rd78, %rd1, %r16;
 ; CHECK-NEXT:    sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT:    shl.b64 %rd81, %rd2, %r18;
-; CHECK-NEXT:    or.b64 %rd82, %rd80, %rd81;
+; CHECK-NEXT:    shl.b64 %rd79, %rd2, %r18;
+; CHECK-NEXT:    or.b64 %rd80, %rd78, %rd79;
 ; CHECK-NEXT:    add.s32 %r19, %r16, -64;
-; CHECK-NEXT:    shr.u64 %rd83, %rd2, %r19;
+; CHECK-NEXT:    shr.u64 %rd81, %rd2, %r19;
 ; CHECK-NEXT:    setp.gt.s32 %p17, %r16, 63;
-; CHECK-NEXT:    selp.b64 %rd116, %rd83, %rd82, %p17;
-; CHECK-NEXT:    shr.u64 %rd117, %rd2, %r16;
+; CHECK-NEXT:    selp.b64 %rd114, %rd81, %rd80, %p17;
+; CHECK-NEXT:    shr.u64 %rd115, %rd2, %r16;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT:    mov.u64 %rd109, 0;
-; CHECK-NEXT:    mov.u64 %rd112, %rd109;
+; CHECK-NEXT:    mov.u64 %rd107, 0;
+; CHECK-NEXT:    mov.u64 %rd110, %rd107;
 ; CHECK-NEXT:  $L__BB4_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd84, %rd116, 63;
-; CHECK-NEXT:    shl.b64 %rd85, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd86, %rd85, %rd84;
-; CHECK-NEXT:    shl.b64 %rd87, %rd116, 1;
-; CHECK-NEXT:    shr.u64 %rd88, %rd119, 63;
-; CHECK-NEXT:    or.b64 %rd89, %rd87, %rd88;
-; CHECK-NEXT:    shr.u64 %rd90, %rd118, 63;
-; CHECK-NEXT:    shl.b64 %rd91, %rd119, 1;
-; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
-; CHECK-NEXT:    shl.b64 %rd93, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd118, %rd112, %rd93;
-; CHECK-NEXT:    or.b64 %rd119, %rd109, %rd92;
-; CHECK-NEXT:    sub.cc.s64 %rd94, %rd35, %rd89;
-; CHECK-NEXT:    subc.cc.s64 %rd95, %rd36, %rd86;
-; CHECK-NEXT:    shr.s64 %rd96, %rd95, 63;
-; CHECK-NEXT:    and.b64 %rd112, %rd96, 1;
-; CHECK-NEXT:    and.b64 %rd97, %rd96, %rd3;
-; CHECK-NEXT:    and.b64 %rd98, %rd96, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd116, %rd89, %rd97;
-; CHECK-NEXT:    subc.cc.s64 %rd117, %rd86, %rd98;
-; CHECK-NEXT:    add.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd115, %rd115, -1;
-; CHECK-NEXT:    or.b64 %rd99, %rd114, %rd115;
-; CHECK-NEXT:    setp.eq.s64 %p18, %rd99, 0;
+; CHECK-NEXT:    shr.u64 %rd82, %rd114, 63;
+; CHECK-NEXT:    shl.b64 %rd83, %rd115, 1;
+; CHECK-NEXT:    or.b64 %rd84, %rd83, %rd82;
+; CHECK-NEXT:    shl.b64 %rd85, %rd114, 1;
+; CHECK-NEXT:    shr.u64 %rd86, %rd117, 63;
+; CHECK-NEXT:    or.b64 %rd87, %rd85, %rd86;
+; CHECK-NEXT:    shr.u64 %rd88, %rd116, 63;
+; CHECK-NEXT:    shl.b64 %rd89, %rd117, 1;
+; CHECK-NEXT:    or.b64 %rd90, %rd89, %rd88;
+; CHECK-NEXT:    shl.b64 %rd91, %rd116, 1;
+; CHECK-NEXT:    or.b64 %rd116, %rd110, %rd91;
+; CHECK-NEXT:    or.b64 %rd117, %rd107, %rd90;
+; CHECK-NEXT:    sub.cc.s64 %rd92, %rd35, %rd87;
+; CHECK-NEXT:    subc.cc.s64 %rd93, %rd36, %rd84;
+; CHECK-NEXT:    shr.s64 %rd94, %rd93, 63;
+; CHECK-NEXT:    and.b64 %rd110, %rd94, 1;
+; CHECK-NEXT:    and.b64 %rd95, %rd94, %rd3;
+; CHECK-NEXT:    and.b64 %rd96, %rd94, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd114, %rd87, %rd95;
+; CHECK-NEXT:    subc.cc.s64 %rd115, %rd84, %rd96;
+; CHECK-NEXT:    add.cc.s64 %rd112, %rd112, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd113, %rd113, -1;
+; CHECK-NEXT:    or.b64 %rd97, %rd112, %rd113;
+; CHECK-NEXT:    setp.eq.s64 %p18, %rd97, 0;
 ; CHECK-NEXT:    @%p18 bra $L__BB4_4;
 ; CHECK-NEXT:    bra.uni $L__BB4_2;
 ; CHECK-NEXT:  $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd100, %rd118, 63;
-; CHECK-NEXT:    shl.b64 %rd101, %rd119, 1;
-; CHECK-NEXT:    or.b64 %rd102, %rd101, %rd100;
-; CHECK-NEXT:    shl.b64 %rd103, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd120, %rd112, %rd103;
-; CHECK-NEXT:    or.b64 %rd121, %rd109, %rd102;
+; CHECK-NEXT:    shr.u64 %rd98, %rd116, 63;
+; CHECK-NEXT:    shl.b64 %rd99, %rd117, 1;
+; CHECK-NEXT:    or.b64 %rd100, %rd99, %rd98;
+; CHECK-NEXT:    shl.b64 %rd101, %rd116, 1;
+; CHECK-NEXT:    or.b64 %rd118, %rd110, %rd101;
+; CHECK-NEXT:    or.b64 %rd119, %rd107, %rd100;
 ; CHECK-NEXT:  $L__BB4_5: // %udiv-end
-; CHECK-NEXT:    xor.b64 %rd104, %rd120, %rd5;
-; CHECK-NEXT:    xor.b64 %rd105, %rd121, %rd5;
-; CHECK-NEXT:    sub.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT:    subc.cc.s64 %rd107, %rd105, %rd5;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd106, %rd107};
+; CHECK-NEXT:    xor.b64 %rd102, %rd118, %rd5;
+; CHECK-NEXT:    xor.b64 %rd103, %rd119, %rd5;
+; CHECK-NEXT:    sub.cc.s64 %rd104, %rd102, %rd5;
+; CHECK-NEXT:    subc.cc.s64 %rd105, %rd103, %rd5;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd104, %rd105};
 ; CHECK-NEXT:    ret;
   %div = sdiv i128 %lhs, %rhs
   ret i128 %div
@@ -467,7 +467,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<17>;
 ; CHECK-NEXT:    .reg .b32 %r<20>;
-; CHECK-NEXT:    .reg .b64 %rd<107>;
+; CHECK-NEXT:    .reg .b64 %rd<105>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.u64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -491,98 +491,98 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
 ; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
 ; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.u64 %rd97, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd57, %rd97, 0;
-; CHECK-NEXT:    setp.eq.s64 %p6, %rd57, 0;
-; CHECK-NEXT:    setp.ne.s64 %p7, %rd57, 0;
+; CHECK-NEXT:    mov.u64 %rd95, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd5, %rd50, %rd54;
+; CHECK-NEXT:    subc.cc.s64 %rd6, %rd95, 0;
+; CHECK-NEXT:    setp.eq.s64 %p6, %rd6, 0;
+; CHECK-NEXT:    setp.ne.s64 %p7, %rd6, 0;
 ; CHECK-NEXT:    selp.u32 %r5, -1, 0, %p7;
-; CHECK-NEXT:    setp.gt.u64 %p8, %rd56, 127;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd5, 127;
 ; CHECK-NEXT:    selp.u32 %r6, -1, 0, %p8;
 ; CHECK-NEXT:    selp.b32 %r7, %r6, %r5, %p6;
 ; CHECK-NEXT:    and.b32 %r8, %r7, 1;
 ; CHECK-NEXT:    setp.eq.b32 %p9, %r8, 1;
 ; CHECK-NEXT:    or.pred %p10, %p3, %p9;
-; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
-; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
-; CHECK-NEXT:    setp.eq.s64 %p11, %rd59, 0;
-; CHECK-NEXT:    selp.b64 %rd106, 0, %rd42, %p10;
-; CHECK-NEXT:    selp.b64 %rd105, 0, %rd41, %p10;
+; CHECK-NEXT:    xor.b64 %rd56, %rd5, 127;
+; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd6;
+; CHECK-NEXT:    setp.eq.s64 %p11, %rd57, 0;
+; CHECK-NEXT:    selp.b64 %rd104, 0, %rd42, %p10;
+; CHECK-NEXT:    selp.b64 %rd103, 0, %rd41, %p10;
 ; CHECK-NEXT:    or.pred %p12, %p10, %p11;
 ; CHECK-NEXT:    @%p12 bra $L__BB5_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd99, %rd56, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd100, %rd57, 0;
-; CHECK-NEXT:    or.b64 %rd62, %rd99, %rd100;
-; CHECK-NEXT:    setp.eq.s64 %p13, %rd62, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd56;
+; CHECK-NEXT:    add.cc.s64 %rd97, %rd5, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd98, %rd6, 0;
+; CHECK-NEXT:    or.b64 %rd60, %rd97, %rd98;
+; CHECK-NEXT:    setp.eq.s64 %p13, %rd60, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd5;
 ; CHECK-NEXT:    mov.b32 %r10, 127;
 ; CHECK-NEXT:    sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r11;
+; CHECK-NEXT:    shl.b64 %rd61, %rd42, %r11;
 ; CHECK-NEXT:    mov.b32 %r12, 64;
 ; CHECK-NEXT:    sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r13;
-; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
+; CHECK-NEXT:    shr.u64 %rd62, %rd41, %r13;
+; CHECK-NEXT:    or.b64 %rd63, %rd61, %rd62;
 ; CHECK-NEXT:    mov.b32 %r14, 63;
 ; CHECK-NEXT:    sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r15;
+; CHECK-NEXT:    shl.b64 %rd64, %rd41, %r15;
 ; CHECK-NEXT:    setp.gt.s32 %p14, %r11, 63;
-; CHECK-NEXT:    selp.b64 %rd104, %rd66, %rd65, %p14;
-; CHECK-NEXT:    shl.b64 %rd103, %rd41, %r11;
-; CHECK-NEXT:    mov.u64 %rd94, %rd97;
+; CHECK-NEXT:    selp.b64 %rd102, %rd64, %rd63, %p14;
+; CHECK-NEXT:    shl.b64 %rd101, %rd41, %r11;
+; CHECK-NEXT:    mov.u64 %rd92, %rd95;
 ; CHECK-NEXT:    @%p13 bra $L__BB5_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r16, %rd99;
-; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r16;
+; CHECK-NEXT:    cvt.u32.u64 %r16, %rd97;
+; CHECK-NEXT:    shr.u64 %rd67, %rd41, %r16;
 ; CHECK-NEXT:    sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r18;
-; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
+; CHECK-NEXT:    shl.b64 %rd68, %rd42, %r18;
+; CHECK-NEXT:    or.b64 %rd69, %rd67, %rd68;
 ; CHECK-NEXT:    add.s32 %r19, %r16, -64;
-; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r19;
+; CHECK-NEXT:    shr.u64 %rd70, %rd42, %r19;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r16, 63;
-; CHECK-NEXT:    selp.b64 %rd101, %rd72, %rd71, %p15;
-; CHECK-NEXT:    shr.u64 %rd102, %rd42, %r16;
+; CHECK-NEXT:    selp.b64 %rd99, %rd70, %rd69, %p15;
+; CHECK-NEXT:    shr.u64 %rd100, %rd42, %r16;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd43, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT:    mov.u64 %rd94, 0;
-; CHECK-NEXT:    mov.u64 %rd97, %rd94;
+; CHECK-NEXT:    mov.u64 %rd92, 0;
+; CHECK-NEXT:    mov.u64 %rd95, %rd92;
 ; CHECK-NEXT:  $L__BB5_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd73, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd74, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
-; CHECK-NEXT:    shl.b64 %rd76, %rd101, 1;
-; CHECK-NEXT:    shr.u64 %rd77, %rd104, 63;
-; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
-; CHECK-NEXT:    shr.u64 %rd79, %rd103, 63;
-; CHECK-NEXT:    shl.b64 %rd80, %rd104, 1;
-; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
-; CHECK-NEXT:    shl.b64 %rd82, %rd103, 1;
-; CHECK-NEXT:    or.b64 %rd103, %rd97, %rd82;
-; CHECK-NEXT:    or.b64 %rd104, %rd94, %rd81;
-; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
-; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
-; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
-; CHECK-NEXT:    and.b64 %rd97, %rd85, 1;
-; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd43;
-; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd44;
-; CHECK-NEXT:    sub.cc.s64 %rd101, %rd78, %rd86;
-; CHECK-NEXT:    subc.cc.s64 %rd102, %rd75, %rd87;
-; CHECK-NEXT:    add.cc.s64 %rd99, %rd99, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd100, %rd100, -1;
-; CHECK-NEXT:    or.b64 %rd88, %rd99, %rd100;
-; CHECK-NEXT:    setp.eq.s64 %p16, %rd88, 0;
+; CHECK-NEXT:    shr.u64 %rd71, %rd99, 63;
+; CHECK-NEXT:    shl.b64 %rd72, %rd100, 1;
+; CHECK-NEXT:    or.b64 %rd73, %rd72, %rd71;
+; CHECK-NEXT:    shl.b64 %rd74, %rd99, 1;
+; CHECK-NEXT:    shr.u64 %rd75, %rd102, 63;
+; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
+; CHECK-NEXT:    shr.u64 %rd77, %rd101, 63;
+; CHECK-NEXT:    shl.b64 %rd78, %rd102, 1;
+; CHECK-NEXT:    or.b64 %rd79, %rd78, %rd77;
+; CHECK-NEXT:    shl.b64 %rd80, %rd101, 1;
+; CHECK-NEXT:    or.b64 %rd101, %rd95, %rd80;
+; CHECK-NEXT:    or.b64 %rd102, %rd92, %rd79;
+; CHECK-NEXT:    sub.cc.s64 %rd81, %rd33, %rd76;
+; CHECK-NEXT:    subc.cc.s64 %rd82, %rd34, %rd73;
+; CHECK-NEXT:    shr.s64 %rd83, %rd82, 63;
+; CHECK-NEXT:    and.b64 %rd95, %rd83, 1;
+; CHECK-NEXT:    and.b64 %rd84, %rd83, %rd43;
+; CHECK-NEXT:    and.b64 %rd85, %rd83, %rd44;
+; CHECK-NEXT:    sub.cc.s64 %rd99, %rd76, %rd84;
+; CHECK-NEXT:    subc.cc.s64 %rd100, %rd73, %rd85;
+; CHECK-NEXT:    add.cc.s64 %rd97, %rd97, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd98, %rd98, -1;
+; CHECK-NEXT:    or.b64 %rd86, %rd97, %rd98;
+; CHECK-NEXT:    setp.eq.s64 %p16, %rd86, 0;
 ; CHECK-NEXT:    @%p16 bra $L__BB5_4;
 ; CHECK-NEXT:    bra.uni $L__BB5_2;
 ; CHECK-NEXT:  $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd89, %rd103, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd104, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd103, 1;
-; CHECK-NEXT:    or.b64 %rd105, %rd97, %rd92;
-; CHECK-NEXT:    or.b64 %rd106, %rd94, %rd91;
+; CHECK-NEXT:    shr.u64 %rd87, %rd101, 63;
+; CHECK-NEXT:    shl.b64 %rd88, %rd102, 1;
+; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
+; CHECK-NEXT:    shl.b64 %rd90, %rd101, 1;
+; CHECK-NEXT:    or.b64 %rd103, %rd95, %rd90;
+; CHECK-NEXT:    or.b64 %rd104, %rd92, %rd89;
 ; CHECK-NEXT:  $L__BB5_5: // %udiv-end
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd105, %rd106};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0+0], {%rd103, %rd104};
 ; CHECK-NEXT:    ret;
   %div = udiv i128 %lhs, %rhs
   ret i128 %div
diff --git a/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll b/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll
deleted file mode 100644
index 0c2d2829a6d4b..0000000000000
--- a/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: llc -mtriple=powerpc %s -o - | FileCheck %s --check-prefixes=CHECK,PPC32
-; RUN: llc -mtriple=powerpc64 %s -o - | FileCheck %s --check-prefixes=CHECK,PPC64
-
-@a = global i32 0, align 4
-
-define void @f0() {
-; CHECK-LABEL: f0:
-; CHECK-NOT:   nop
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blr
-; CHECK-NOT:   .section    __patchable_function_entries
-  ret void
-}
-
-define void @f1() "patchable-function-entry"="0" {
-; CHECK-LABEL: f1:
-; CHECK-NOT:   nop
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blr
-; CHECK-NOT:   .section    __patchable_function_entries
-  ret void
-}
-
-define void @f2() "patchable-function-entry"="1" {
-; CHECK-LABEL: f2:
-; CHECK-LABEL-NEXT:  .Lfunc_begin2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    blr
-; CHECK:       .section    __patchable_function_entries
-; PPC32:       .p2align    2, 0x0
-; PPC64:       .p2align    3, 0x0
-; PPC32-NEXT:  .long   .Lfunc_begin2
-; PPC64-NEXT:  .quad   .Lfunc_begin2
-  ret void
-}
-
-define i32 @f3() "patchable-function-entry"="1" "patchable-function-prefix"="2" {
-; CHECK-LABEL: .Ltmp0:
-; CHECK-COUNT-2: nop
-; CHECK-LABEL: f3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    nop
-; PPC32:         lis 3, a@ha
-; PPC32-NEXT:    lwz 3, a@l(3)
-; PPC64:         addis 3, 2, .LC0@toc@ha
-; PPC64-NEXT:    ld 3, .LC0@toc@l(3)
-; PPC64-NEXT:    lwz 3, 0(3)
-; CHECK:         blr
-; CHECK:       .section    __patchable_function_entries
-; PPC32:       .p2align    2, 0x0
-; PPC64:       .p2align    3, 0x0
-; PPC32-NEXT:  .long   .Ltmp0
-; PPC64-NEXT:  .quad   .Ltmp0
-entry:
-  %0 = load i32, ptr @a, align 4
-  ret i32 %0
-}
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
index b98d2d57a0b52..e6462ef93998f 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=NOZACAS,RV32IA %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=ZACAS,RV32IA-ZACAS %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=NOZACAS,RV64IA %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=ZACAS,RV64IA-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas,+zabha -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=ZACAS,RV64IA-ZABHA %s
 
 ; Test cmpxchg followed by a branch on the cmpxchg success value to see if the
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
index c47db319fc2c3..e336246b450a4 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
@@ -3,25 +3,25 @@
 ; RUN:   | FileCheck -check-prefix=RV32I %s
 ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-WMO %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-WMO-ZACAS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-TSO %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-TSO-ZACAS %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV64I %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-WMO %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-WMO-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas,+zabha -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZABHA,RV64IA-WMO-ZABHA %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-TSO %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-TSO-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+experimental-zacas,+zabha -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zacas,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZABHA,RV64IA-TSO-ZABHA %s
 
 define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index 4223440b9cb88..c7c9c339a8880 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -12,22 +12,22 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-NOZACAS,RV64IA-TSO,RV64IA-TSO-NOZACAS %s
 
-; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-WMO,RV32IA-WMO-ZACAS %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+ztso,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS,RV32IA-TSO,RV32IA-TSO-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-WMO,RV64IA-WMO-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS,RV64IA-TSO,RV64IA-TSO-ZACAS %s
 
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-WMO,RV64IA-WMO-ZABHA,RV64IA-WMO-ZABHA-NOZACAS %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zabha -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-TSO,RV64IA-TSO-ZABHA,RV64IA-TSO-ZABHA-NOZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-WMO,RV64IA-WMO-ZABHA,RV64IA-WMO-ZABHA-ZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zabha,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+ztso,+zabha,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-TSO,RV64IA-TSO-ZABHA,RV64IA-TSO-ZABHA-ZACAS %s
 
 define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index 775c17c3ceb3f..f7268f6288127 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -3,13 +3,13 @@
 ; RUN:   | FileCheck -check-prefix=RV32I %s
 ; RUN: llc -mtriple=riscv32 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-NOZACAS %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV32IA,RV32IA-ZACAS %s
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV64I %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-NOZACAS %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefixes=RV64IA,RV64IA-ZACAS %s
 
 define signext i8 @atomic_load_i8_unordered(ptr %a) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 4f841683a868c..c9fe1059b1378 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -120,7 +120,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFMIN %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFWMA %s
-; RUN: llc -mtriple=riscv32 -mattr=+a,+experimental-zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s
+; RUN: llc -mtriple=riscv32 -mattr=+a,+zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV32ZALASR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zama16b %s -o - | FileCheck --check-prefixes=CHECK,RV32ZAMA16B %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV32ZICFILP %s
@@ -259,7 +259,7 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFMIN %s
 ; RUN: llc -mtriple=riscv64 -mattr=+zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFWMA %s
-; RUN: llc -mtriple=riscv64 -mattr=+a,+experimental-zacas %s -o - | FileCheck --check-prefix=RV64ZACAS %s
+; RUN: llc -mtriple=riscv64 -mattr=+a,+zacas %s -o - | FileCheck --check-prefix=RV64ZACAS %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV64ZALASR %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV64ZICFILP %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a,+zabha %s -o - | FileCheck --check-prefix=RV64ZABHA %s
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index feea4f19720b0..6024a29da33d2 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -742,8 +742,9 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI12_0)(a2)
 ; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
+; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a2, .LBB12_2
+; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB12_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1: # %start
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB12_2: # %start
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index 0839f61b2d793..927eee2e9e545 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -102,29 +102,30 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI1_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI1_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI1_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI1_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI1_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI1_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
+; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB1_2
+; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB1_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB1_2:
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI1_1)
+; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI1_1)(a1)
+; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI1_1+4)(a1)
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB1_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB1_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a5, a1
+; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -346,29 +347,30 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI5_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI5_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI5_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI5_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI5_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
+; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB5_2
+; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB5_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB5_2:
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI5_1)
+; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI5_1)(a1)
+; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI5_1+4)(a1)
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB5_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB5_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a5, a1
+; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -590,29 +592,30 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI9_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI9_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI9_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI9_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI9_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI9_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
+; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB9_2
+; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB9_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB9_2:
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI9_1)
+; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI9_1)(a1)
+; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI9_1+4)(a1)
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB9_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB9_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a5, a1
+; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -834,29 +837,30 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI13_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI13_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI13_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI13_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI13_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI13_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
+; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB13_2
+; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB13_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB13_2:
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI13_1)
+; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI13_1)(a1)
+; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI13_1+4)(a1)
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB13_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB13_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a5, a1
+; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1078,29 +1082,30 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI17_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI17_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI17_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI17_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI17_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI17_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
+; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB17_2
+; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB17_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB17_2:
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI17_1)
+; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI17_1)(a1)
+; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI17_1+4)(a1)
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB17_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB17_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a5, a1
+; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1322,29 +1327,30 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI21_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI21_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI21_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI21_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI21_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI21_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
+; RV32IZFINXZDINX-NEXT:    li a4, 1
 ; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB21_2
+; RV32IZFINXZDINX-NEXT:    bne a2, a4, .LBB21_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
 ; RV32IZFINXZDINX-NEXT:    mv a3, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB21_2:
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI21_1)
+; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI21_1)(a1)
+; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI21_1+4)(a1)
+; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
 ; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB21_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
 ; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB21_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a5, a1
+; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    neg a2, a4
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
index f9b9c8a69d431..3fa494e1a57dd 100644
--- a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
+++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
@@ -54,8 +54,7 @@ define i1 @pr85190(i64 %a) {
 ; CHECK-ZBB-NEXT:    li a2, -1
 ; CHECK-ZBB-NEXT:    slli a2, a2, 63
 ; CHECK-ZBB-NEXT:    sub a2, a2, a1
-; CHECK-ZBB-NEXT:    min a1, a2, zero
-; CHECK-ZBB-NEXT:    slt a0, a0, a1
+; CHECK-ZBB-NEXT:    slt a0, a0, a2
 ; CHECK-ZBB-NEXT:    ret
   %or = or i64 %a, 7
   %cmp1 = icmp slt i64 %a, 0
diff --git a/llvm/test/CodeGen/RISCV/pr97304.ll b/llvm/test/CodeGen/RISCV/pr97304.ll
deleted file mode 100644
index 120a0e787384d..0000000000000
--- a/llvm/test/CodeGen/RISCV/pr97304.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=riscv64 -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s
-
-define i32 @_ZNK2cv12LMSolverImpl3runERKNS_17_InputOutputArrayE(i1 %cmp436) {
-  ; CHECK-LABEL: name: _ZNK2cv12LMSolverImpl3runERKNS_17_InputOutputArrayE
-  ; CHECK: bb.0.entry:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $x10
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY [[COPY]]
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1.for.cond:
-  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY1]], 1
-  ; CHECK-NEXT:   ADJCALLSTACKDOWN 8, 0, implicit-def dead $x2, implicit $x2
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x2
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gprjalr = COPY $x0
-  ; CHECK-NEXT:   SD [[COPY3]], [[COPY2]], 0 :: (store (s64))
-  ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
-  ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 32
-  ; CHECK-NEXT:   BNE [[ANDI]], $x0, %bb.3
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2.for.cond (call-frame-size 8):
-  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.3.for.cond (call-frame-size 8):
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[ADDI1]], %bb.1, [[ADDI]], %bb.2
-  ; CHECK-NEXT:   $x10 = COPY [[COPY3]]
-  ; CHECK-NEXT:   $x11 = COPY [[PHI]]
-  ; CHECK-NEXT:   $x12 = COPY [[COPY3]]
-  ; CHECK-NEXT:   $x13 = COPY [[COPY3]]
-  ; CHECK-NEXT:   $x14 = COPY [[COPY3]]
-  ; CHECK-NEXT:   $x15 = COPY [[COPY3]]
-  ; CHECK-NEXT:   $x16 = COPY [[COPY3]]
-  ; CHECK-NEXT:   $x17 = COPY [[COPY3]]
-  ; CHECK-NEXT:   PseudoCALLIndirect [[COPY3]], csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit-def $x2, implicit-def $x10
-  ; CHECK-NEXT:   ADJCALLSTACKUP 8, 0, implicit-def dead $x2, implicit $x2
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   PseudoBR %bb.1
-entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.cond, %entry
-  %conv = select i1 %cmp436, i32 32, i32 1
-  %call479 = call i32 (ptr, ...) null(ptr null, i32 %conv, i32 0, i32 0, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00)
-  br label %for.cond
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr99782.ll b/llvm/test/CodeGen/RISCV/rvv/pr99782.ll
deleted file mode 100644
index 92c40f41f02b5..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/pr99782.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -stop-after=finalize-isel | FileCheck %s
-
-define void @vslidedown() {
-  ; CHECK-LABEL: name: vslidedown
-  ; CHECK: bb.0.entry:
-  ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = ADDI %stack.0.v, 0
-  ; CHECK-NEXT:   [[VL8RE8_V:%[0-9]+]]:vrm8 = VL8RE8_V killed [[ADDI]] :: (load (<vscale x 1 x s512>) from %ir.v, align 1)
-  ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI %stack.1, 0
-  ; CHECK-NEXT:   VS8R_V killed [[VL8RE8_V]], killed [[ADDI1]] :: (store (<vscale x 1 x s512>) into %stack.1)
-  ; CHECK-NEXT:   INLINEASM &"vadd.vv $0, $0, $0", 25 /* sideeffect mayload maystore attdialect */, 262166 /* mem:m */, %stack.0.v, 0, 262166 /* mem:m */, %stack.1, 0
-  ; CHECK-NEXT:   PseudoRET
-entry:
-  %v = alloca <vscale x 64 x i8>, align 1
-  %0 = load <vscale x 64 x i8>, ptr %v, align 1
-  call void asm sideeffect "vadd.vv $0, $0, $0", "=*imr,imr"(ptr elementtype(<vscale x 64 x i8>) %v, <vscale x 64 x i8> %0)
-  ret void
-}
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 62f5d49192ea9..8a6a30318ae58 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -13,6 +13,7 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
 define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-LABEL: m:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
 ; CHECK-NEXT:    aghi %r15, -168
 ; CHECK-NEXT:    lhrl %r1, f+4
 ; CHECK-NEXT:    sll %r1, 8
@@ -20,66 +21,59 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    ic %r1, 6(%r2)
 ; CHECK-NEXT:    larl %r2, e
 ; CHECK-NEXT:    lb %r0, 3(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vlvgp %v1, %r1, %r0
 ; CHECK-NEXT:    vlvgf %v1, %r1, 0
-; CHECK-NEXT:    vlvgf %v1, %r1, 1
-; CHECK-NEXT:    larl %r2, .LCPI0_0
-; CHECK-NEXT:    vl %v2, 0(%r2), 3
-; CHECK-NEXT:    vlvgf %v1, %r1, 3
-; CHECK-NEXT:    vlvgf %v3, %r1, 3
-; CHECK-NEXT:    vlvgf %v0, %r1, 1
-; CHECK-NEXT:    vperm %v4, %v1, %v0, %v2
-; CHECK-NEXT:    vlvgf %v0, %r1, 3
+; CHECK-NEXT:    vlvgf %v1, %r1, 2
+; CHECK-NEXT:    vlvgp %v2, %r1, %r1
+; CHECK-NEXT:    # kill: def $r1l killed $r1l killed $r1d
 ; CHECK-NEXT:    nilh %r1, 255
 ; CHECK-NEXT:    chi %r1, 128
 ; CHECK-NEXT:    ipm %r1
 ; CHECK-NEXT:    risbg %r1, %r1, 63, 191, 36
-; CHECK-NEXT:    vperm %v0, %v3, %v0, %v2
-; CHECK-NEXT:    larl %r2, .LCPI0_1
-; CHECK-NEXT:    vl %v5, 0(%r2), 3
-; CHECK-NEXT:    vgbm %v6, 30583
-; CHECK-NEXT:    vn %v0, %v0, %v6
-; CHECK-NEXT:    vn %v4, %v4, %v6
-; CHECK-NEXT:    vperm %v1, %v1, %v1, %v5
-; CHECK-NEXT:    vn %v5, %v1, %v6
-; CHECK-NEXT:    vperm %v1, %v0, %v3, %v2
-; CHECK-NEXT:    vn %v2, %v1, %v6
-; CHECK-NEXT:    vrepif %v1, 127
-; CHECK-NEXT:    vchlf %v3, %v5, %v1
-; CHECK-NEXT:    vlgvf %r3, %v3, 1
-; CHECK-NEXT:    vlgvf %r2, %v3, 0
-; CHECK-NEXT:    risbg %r2, %r2, 48, 176, 15
-; CHECK-NEXT:    rosbg %r2, %r3, 49, 49, 14
-; CHECK-NEXT:    vlgvf %r3, %v3, 2
-; CHECK-NEXT:    rosbg %r2, %r3, 50, 50, 13
-; CHECK-NEXT:    vlgvf %r3, %v3, 3
-; CHECK-NEXT:    rosbg %r2, %r3, 51, 51, 12
-; CHECK-NEXT:    vchlf %v3, %v4, %v1
-; CHECK-NEXT:    vlgvf %r3, %v3, 0
-; CHECK-NEXT:    rosbg %r2, %r3, 52, 52, 11
-; CHECK-NEXT:    vlgvf %r3, %v3, 1
-; CHECK-NEXT:    rosbg %r2, %r3, 53, 53, 10
-; CHECK-NEXT:    vlgvf %r3, %v3, 2
-; CHECK-NEXT:    rosbg %r2, %r3, 54, 54, 9
-; CHECK-NEXT:    vlgvf %r3, %v3, 3
-; CHECK-NEXT:    rosbg %r2, %r3, 55, 55, 8
-; CHECK-NEXT:    vchlf %v2, %v2, %v1
-; CHECK-NEXT:    vlgvf %r3, %v2, 0
-; CHECK-NEXT:    rosbg %r2, %r3, 56, 56, 7
+; CHECK-NEXT:    vlvgf %v0, %r0, 0
+; CHECK-NEXT:    vlvgf %v0, %r0, 2
+; CHECK-NEXT:    vgbm %v3, 30583
+; CHECK-NEXT:    vn %v0, %v0, %v3
+; CHECK-NEXT:    vn %v1, %v1, %v3
+; CHECK-NEXT:    vrepf %v2, %v2, 1
+; CHECK-NEXT:    vn %v2, %v2, %v3
+; CHECK-NEXT:    vrepif %v3, 127
+; CHECK-NEXT:    vchlf %v1, %v1, %v3
+; CHECK-NEXT:    vlgvf %r13, %v1, 0
+; CHECK-NEXT:    vchlf %v2, %v2, %v3
 ; CHECK-NEXT:    vlgvf %r3, %v2, 1
-; CHECK-NEXT:    rosbg %r2, %r3, 57, 57, 6
-; CHECK-NEXT:    vlgvf %r3, %v2, 2
-; CHECK-NEXT:    rosbg %r2, %r3, 58, 58, 5
-; CHECK-NEXT:    vlgvf %r3, %v2, 3
-; CHECK-NEXT:    rosbg %r2, %r3, 59, 59, 4
-; CHECK-NEXT:    vchlf %v0, %v0, %v1
-; CHECK-NEXT:    vlgvf %r3, %v0, 0
-; CHECK-NEXT:    rosbg %r2, %r3, 60, 60, 3
-; CHECK-NEXT:    vlgvf %r3, %v0, 1
-; CHECK-NEXT:    rosbg %r2, %r3, 61, 61, 2
-; CHECK-NEXT:    vlgvf %r3, %v0, 2
-; CHECK-NEXT:    rosbg %r2, %r3, 62, 62, 1
-; CHECK-NEXT:    vlgvf %r3, %v0, 3
-; CHECK-NEXT:    rosbg %r2, %r3, 63, 63, 0
+; CHECK-NEXT:    nilf %r3, 1
+; CHECK-NEXT:    vlgvf %r4, %v2, 0
+; CHECK-NEXT:    risbg %r2, %r4, 48, 176, 15
+; CHECK-NEXT:    rosbg %r2, %r3, 32, 49, 14
+; CHECK-NEXT:    vlgvf %r5, %v2, 2
+; CHECK-NEXT:    nilf %r5, 1
+; CHECK-NEXT:    rosbg %r2, %r5, 32, 50, 13
+; CHECK-NEXT:    vlgvf %r14, %v2, 3
+; CHECK-NEXT:    nilf %r14, 1
+; CHECK-NEXT:    rosbg %r2, %r14, 32, 51, 12
+; CHECK-NEXT:    rosbg %r2, %r13, 52, 52, 11
+; CHECK-NEXT:    vlgvf %r13, %v1, 1
+; CHECK-NEXT:    rosbg %r2, %r13, 53, 53, 10
+; CHECK-NEXT:    vlgvf %r13, %v1, 2
+; CHECK-NEXT:    rosbg %r2, %r13, 54, 54, 9
+; CHECK-NEXT:    vlgvf %r13, %v1, 3
+; CHECK-NEXT:    rosbg %r2, %r13, 55, 55, 8
+; CHECK-NEXT:    vchlf %v0, %v0, %v3
+; CHECK-NEXT:    vlgvf %r13, %v0, 0
+; CHECK-NEXT:    rosbg %r2, %r13, 56, 56, 7
+; CHECK-NEXT:    vlgvf %r13, %v0, 1
+; CHECK-NEXT:    rosbg %r2, %r13, 57, 57, 6
+; CHECK-NEXT:    vlgvf %r13, %v0, 2
+; CHECK-NEXT:    rosbg %r2, %r13, 58, 58, 5
+; CHECK-NEXT:    vlgvf %r13, %v0, 3
+; CHECK-NEXT:    rosbg %r2, %r13, 59, 59, 4
+; CHECK-NEXT:    nilf %r4, 1
+; CHECK-NEXT:    rosbg %r2, %r4, 32, 60, 3
+; CHECK-NEXT:    rosbg %r2, %r3, 32, 61, 2
+; CHECK-NEXT:    rosbg %r2, %r5, 32, 62, 1
+; CHECK-NEXT:    or %r2, %r14
 ; CHECK-NEXT:    vlgvb %r4, %v0, 1
 ; CHECK-NEXT:    vlgvb %r3, %v0, 0
 ; CHECK-NEXT:    risbg %r3, %r3, 48, 176, 15
@@ -122,7 +116,7 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    nr %r2, %r0
 ; CHECK-NEXT:    larl %r1, g
 ; CHECK-NEXT:    stc %r2, 0(%r1)
-; CHECK-NEXT:    aghi %r15, 168
+; CHECK-NEXT:    lmg %r13, %r15, 272(%r15)
 ; CHECK-NEXT:    br %r14
 entry:
   %n = alloca i32, align 4
diff --git a/llvm/test/CodeGen/VE/Scalar/max.ll b/llvm/test/CodeGen/VE/Scalar/max.ll
index 51da557c6c49f..12aa101cb48c4 100644
--- a/llvm/test/CodeGen/VE/Scalar/max.ll
+++ b/llvm/test/CodeGen/VE/Scalar/max.ll
@@ -281,13 +281,11 @@ define zeroext i1 @maxi1(i1 zeroext, i1 zeroext) {
 ; CHECK-LABEL: maxi1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    or %s0, %s0, %s1
-; CHECK-NEXT:    and %s0, 1, %s0
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
 ; OPT-LABEL: maxi1:
 ; OPT:       # %bb.0:
 ; OPT-NEXT:    or %s0, %s0, %s1
-; OPT-NEXT:    and %s0, 1, %s0
 ; OPT-NEXT:    b.l.t (, %s10)
   %3 = xor i1 %1, true
   %4 = and i1 %3, %0
diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll
index 69d5ce48601f8..da92ebafd0590 100644
--- a/llvm/test/CodeGen/VE/Scalar/min.ll
+++ b/llvm/test/CodeGen/VE/Scalar/min.ll
@@ -278,7 +278,6 @@ define i32 @min2u32(i32, i32) {
 define zeroext i1 @mini1(i1 zeroext, i1 zeroext) {
 ; CHECK-LABEL: mini1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    and %s2, %s1, %s0
 ; CHECK-NEXT:    cmov.w.ne %s2, %s1, %s0
 ; CHECK-NEXT:    adds.w.zx %s0, %s2, (0)1
@@ -286,7 +285,6 @@ define zeroext i1 @mini1(i1 zeroext, i1 zeroext) {
 ;
 ; OPT-LABEL: mini1:
 ; OPT:       # %bb.0:
-; OPT-NEXT:    and %s0, %s0, (32)0
 ; OPT-NEXT:    and %s2, %s1, %s0
 ; OPT-NEXT:    cmov.w.ne %s2, %s1, %s0
 ; OPT-NEXT:    adds.w.zx %s0, %s2, (0)1
diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll
index d1f07b9eaadcb..11719be4ab5cd 100644
--- a/llvm/test/CodeGen/X86/abdu.ll
+++ b/llvm/test/CodeGen/X86/abdu.ll
@@ -289,6 +289,7 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    subl %ecx, %eax
@@ -324,6 +325,7 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    subl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/combine-pmadd.ll b/llvm/test/CodeGen/X86/combine-pmadd.ll
index d011efa5b6140..565d9ef6eb3e6 100644
--- a/llvm/test/CodeGen/X86/combine-pmadd.ll
+++ b/llvm/test/CodeGen/X86/combine-pmadd.ll
@@ -63,38 +63,6 @@ define <8 x i32> @combine_pmaddwd_concat(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>
   ret <8 x i32> %3
 }
 
-define <8 x i32> @combine_pmaddwd_concat_freeze(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: combine_pmaddwd_concat_freeze:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pmovsxbw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; SSE-NEXT:    pmaddwd %xmm2, %xmm0
-; SSE-NEXT:    pmaddwd %xmm2, %xmm1
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: combine_pmaddwd_concat_freeze:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: combine_pmaddwd_concat_freeze:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm2, %ymm1
-; AVX2-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-  %lo = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
-  %hi = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
-  %flo = freeze <4 x i32> %lo
-  %fhi = freeze <4 x i32> %hi
-  %res = shufflevector <4 x i32> %flo, <4 x i32> %fhi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i32> %res
-}
-
 define <4 x i32> @combine_pmaddwd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: combine_pmaddwd_demandedelts:
 ; SSE:       # %bb.0:
@@ -210,38 +178,6 @@ define <16 x i16> @combine_pmaddubsw_concat(<16 x i8> %a0, <16 x i8> %a1, <16 x
   ret <16 x i16> %3
 }
 
-define <16 x i16> @combine_pmaddubsw_concat_freeze(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: combine_pmaddubsw_concat_freeze:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE-NEXT:    pmaddubsw %xmm2, %xmm0
-; SSE-NEXT:    pmaddubsw %xmm2, %xmm1
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: combine_pmaddubsw_concat_freeze:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: combine_pmaddubsw_concat_freeze:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm2, %ymm1
-; AVX2-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-  %lo = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  %hi = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  %flo = freeze <8 x i16> %lo
-  %fhi = freeze <8 x i16> %hi
-  %res = shufflevector <8 x i16> %flo, <8 x i16> %fhi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i16> %res
-}
-
 define <8 x i16> @combine_pmaddubsw_demandedelts(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-LABEL: combine_pmaddubsw_demandedelts:
 ; SSE:       # %bb.0:
@@ -293,41 +229,10 @@ define i32 @combine_pmaddubsw_constant_sat() {
 
 ; Constant folding PMADDWD was causing an infinite loop in the PCMPGT commuting between 2 constant values.
 define i1 @pmaddwd_pcmpgt_infinite_loop() {
-; SSE-LABEL: pmaddwd_pcmpgt_infinite_loop:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $1, %xmm1
-; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movmskps %xmm1, %eax
-; SSE-NEXT:    testl %eax, %eax
-; SSE-NEXT:    sete %al
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: pmaddwd_pcmpgt_infinite_loop:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vtestps %xmm1, %xmm0
-; AVX1-NEXT:    sete %al
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: pmaddwd_pcmpgt_infinite_loop:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vtestps %xmm1, %xmm0
-; AVX2-NEXT:    sete %al
-; AVX2-NEXT:    retq
+; CHECK-LABEL: pmaddwd_pcmpgt_infinite_loop:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    retq
   %1 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>)
   %2 = icmp eq <4 x i32> %1, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
   %3 = select <4 x i1> %2, <4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1c303de55c95d..aa7b77f01d5ba 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -177,7 +177,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $156, %esp
+; X86-NEXT:    subl $152, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -273,44 +273,42 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %edi, %ecx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %edx
-; X86-NEXT:    cmovnel %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %eax
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    jne .LBB4_8
 ; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    xorl $127, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $127, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    je .LBB4_8
 ; X86-NEXT:  # %bb.2: # %udiv-bb1
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -333,34 +331,34 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %edi
-; X86-NEXT:    movl 148(%esp,%edi), %edx
-; X86-NEXT:    movl 152(%esp,%edi), %esi
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl 144(%esp,%eax), %edx
+; X86-NEXT:    movl 148(%esp,%eax), %esi
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 144(%esp,%edi), %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl 140(%esp,%eax), %ebx
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    shrl %ebp
 ; X86-NEXT:    shrl %cl, %ebp
 ; X86-NEXT:    orl %edx, %ebp
-; X86-NEXT:    movl 140(%esp,%edi), %edx
+; X86-NEXT:    movl 136(%esp,%eax), %eax
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %edx, %eax
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %eax, %ebx
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    jae .LBB4_3
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_3: # %udiv-preheader
@@ -376,176 +374,180 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %al, %ch
 ; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %edx
-; X86-NEXT:    movl 104(%esp,%edx), %ebx
-; X86-NEXT:    movl 100(%esp,%edx), %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 100(%esp,%eax), %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%eax), %edx
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %ebx, %ebp
-; X86-NEXT:    movl 92(%esp,%edx), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%edx), %esi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%esp,%eax), %ebx
+; X86-NEXT:    movl 92(%esp,%eax), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %edx
 ; X86-NEXT:    shldl $1, %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebp, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %ebp
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    shldl $1, %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    andl %edi, %esi
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %esi, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_4
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
 ; X86-NEXT:    shldl $1, %ebp, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    shldl $1, %eax, %ebp
-; X86-NEXT:    orl %ecx, %ebp
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    shldl $1, %ebx, %ebp
+; X86-NEXT:    orl %eax, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:  .LBB4_8: # %udiv-end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    xorl %ecx, %ebp
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %ebp, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %ebp, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
@@ -555,12 +557,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    imull %edx, %esi
@@ -584,7 +586,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    addl $156, %esp
+; X86-NEXT:    addl $152, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 2b475644a38cf..fe791e45eff99 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -964,7 +964,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
 ; AVX512F-LABEL: mul_v64i8:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
 ; AVX512F-NEXT:    vpmaddubsw %ymm4, %ymm5, %ymm4
diff --git a/llvm/test/CodeGen/X86/pr64589.ll b/llvm/test/CodeGen/X86/pr64589.ll
index d93d54f4c31d0..130ef517ae28e 100644
--- a/llvm/test/CodeGen/X86/pr64589.ll
+++ b/llvm/test/CodeGen/X86/pr64589.ll
@@ -7,8 +7,8 @@
 define i8 @test(ptr %p) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzbl 1(%rdi), %eax
-; CHECK-NEXT:    orb (%rdi), %al
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    orb 1(%rdi), %al
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    addb %al, %al
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index 30202701fdb8c..ec7dca4285a35 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -1550,40 +1550,26 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
 ;
 ; SSE42-LABEL: select_v2i8:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movzwl (%rdi), %eax
-; SSE42-NEXT:    movd %eax, %xmm0
-; SSE42-NEXT:    movzwl (%rsi), %eax
-; SSE42-NEXT:    movd %eax, %xmm1
-; SSE42-NEXT:    pcmpeqb %xmm0, %xmm1
-; SSE42-NEXT:    pmovsxbq %xmm1, %xmm0
-; SSE42-NEXT:    movmskpd %xmm0, %eax
-; SSE42-NEXT:    cmpl $3, %eax
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pxor %xmm0, %xmm1
+; SSE42-NEXT:    ptest %xmm1, %xmm1
 ; SSE42-NEXT:    sete %al
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: select_v2i8:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    movzwl (%rdi), %eax
-; AVX1OR2-NEXT:    vmovd %eax, %xmm0
-; AVX1OR2-NEXT:    movzwl (%rsi), %eax
-; AVX1OR2-NEXT:    vmovd %eax, %xmm1
-; AVX1OR2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpmovsxbq %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vtestpd %xmm1, %xmm0
-; AVX1OR2-NEXT:    setb %al
+; AVX1OR2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1OR2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1OR2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT:    vptest %xmm0, %xmm0
+; AVX1OR2-NEXT:    sete %al
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512-LABEL: select_v2i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movzwl (%rdi), %eax
-; AVX512-NEXT:    vmovd %eax, %xmm0
-; AVX512-NEXT:    movzwl (%rsi), %eax
-; AVX512-NEXT:    vmovd %eax, %xmm1
-; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
-; AVX512-NEXT:    knotw %k0, %k0
-; AVX512-NEXT:    kmovd %k0, %eax
-; AVX512-NEXT:    testb $3, %al
+; AVX512-NEXT:    cmpw (%rsi), %ax
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    retq
   %v0 = load <2 x i8>, ptr %s0, align 1
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index 2df39d69dbb75..951bcfa8fc1b7 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -1433,25 +1433,19 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
 ;
 ; SSE42-LABEL: select_v2i8:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movzwl (%rdi), %eax
-; SSE42-NEXT:    movd %eax, %xmm0
-; SSE42-NEXT:    movzwl (%rsi), %eax
-; SSE42-NEXT:    movd %eax, %xmm1
-; SSE42-NEXT:    pcmpeqb %xmm0, %xmm1
-; SSE42-NEXT:    pmovsxbq %xmm1, %xmm0
-; SSE42-NEXT:    movmskpd %xmm0, %eax
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pcmpeqq %xmm0, %xmm1
+; SSE42-NEXT:    movmskpd %xmm1, %eax
 ; SSE42-NEXT:    testl %eax, %eax
 ; SSE42-NEXT:    setne %al
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: select_v2i8:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    movzwl (%rdi), %eax
-; AVX1OR2-NEXT:    vmovd %eax, %xmm0
-; AVX1OR2-NEXT:    movzwl (%rsi), %eax
-; AVX1OR2-NEXT:    vmovd %eax, %xmm1
-; AVX1OR2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vpmovsxbq %xmm0, %xmm0
+; AVX1OR2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1OR2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX1OR2-NEXT:    vtestpd %xmm0, %xmm0
 ; AVX1OR2-NEXT:    setne %al
 ; AVX1OR2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index c725dcd972cd5..b33cc83ac3f79 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -3931,40 +3931,49 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride6_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm9
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm9
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm10
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm2
 ; AVX512-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm12
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm12, %ymm3
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm3
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm16
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm2[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm25
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[2,1,2,3]
 ; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
 ; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
@@ -3974,179 +3983,178 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm6
 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm10[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[12],ymm0[12],ymm10[13],ymm0[13],ymm10[14],ymm0[14],ymm10[15],ymm0[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm6 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm5
-; AVX512-NEXT:    vmovdqa (%r8), %ymm14
-; AVX512-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm17
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm5
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm5
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm7, %zmm7
+; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm7 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm11
+; AVX512-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6],ymm4[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm7[0,1,2,3],zmm4[4,5,6,7]
+; AVX512-NEXT:    vmovdqa (%r9), %ymm4
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm7[2,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm7[2,1,2,3]
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm13
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm18, %zmm2
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-NEXT:    vpermt2d %zmm7, %zmm24, %zmm2
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm15
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[1,1,1,1]
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm0
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm14
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm15
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; AVX512-NEXT:    vpermt2d %zmm8, %zmm18, %zmm4
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm5
+; AVX512-NEXT:    vmovdqa (%r8), %xmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3],ymm7[4],ymm2[5,6],ymm7[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm5[4,5,6,7]
+; AVX512-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm5[0,1,0,1]
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512-NEXT:    vpermt2d %zmm7, %zmm24, %zmm9
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm12
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm4[2,2,2,2]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm4 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm8
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm12
-; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm1, %zmm4
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm11
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm1[2,2,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm14[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm15[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,2,3],zmm0[0,1,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-NEXT:    vmovdqa32 %zmm7, %zmm9 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm9, %ymm7
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm7
+; AVX512-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6],ymm6[7]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm10
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm5, %zmm6
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm12[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm3
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm9[0,1,0,1]
 ; AVX512-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0],ymm1[1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7]
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm12
-; AVX512-NEXT:    vmovdqa (%r9), %ymm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm1[2,2,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT:    vpermt2d %zmm5, %zmm18, %zmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[0,1,2,1]
+; AVX512-NEXT:    vmovdqa32 %zmm6, %zmm4 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm6
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm7[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm7, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm6, %ymm6
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm10 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm12 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX512-NEXT:    vpermt2d %zmm10, %zmm5, %zmm9
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm11[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm5[0,1,0,1]
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm3 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm10[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm15[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[0,1,0,1]
+; AVX512-NEXT:    vmovdqa32 %zmm9, %zmm5 {%k1}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm5, %ymm9
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm9[4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7]
 ; AVX512-NEXT:    vpbroadcastq %xmm5, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-NEXT:    vmovdqa (%r9), %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm6[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm11, %ymm11
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm15, %ymm15
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogd $184, %zmm16, %zmm9, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm13
-; AVX512-NEXT:    vpternlogd $184, %zmm17, %zmm9, %zmm13
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm5, %zmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm2
-; AVX512-NEXT:    vpternlogd $184, %zmm4, %zmm5, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm11, %zmm4
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm12, %zmm5, %zmm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm15, %zmm3
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm5, %zmm3
-; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm13, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm8, 320(%rax)
+; AVX512-NEXT:    vinserti64x4 $1, %ymm19, %zmm18, %zmm9
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogd $184, %zmm17, %zmm10, %zmm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm21, %zmm11
+; AVX512-NEXT:    vpternlogd $184, %zmm20, %zmm10, %zmm11
+; AVX512-NEXT:    vinserti64x4 $1, %ymm24, %zmm16, %zmm10
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm23, %zmm12, %zmm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm12, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm4, %zmm6, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
+; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm3, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm10, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm9, 320(%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -4160,16 +4168,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,1,2,3,11,11,11,11]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm6, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm12
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7]
 ; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm10, %ymm3
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
@@ -4177,17 +4185,17 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [8,21,10,11,20,13,14,23]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm5
-; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm12, %ymm5
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,21,10,11,20,13,14,23]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
-; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm17, %zmm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm11, %zmm4
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [12,1,2,13,4,5,14,7]
+; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm17, %ymm1
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm4[0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm1
@@ -4198,13 +4206,13 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [2,2,0,3,10,0,10,11]
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm9
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm9[4],ymm2[5],ymm9[5],ymm2[6],ymm9[6],ymm2[7],ymm9[7],ymm2[12],ymm9[12],ymm2[13],ymm9[13],ymm2[14],ymm9[14],ymm2[15],ymm9[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm28
 ; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm6
@@ -4217,14 +4225,14 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm0, %ymm12
 ; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm12, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm12, %ymm2
+; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm17, %ymm0
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm11[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm11
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm2
 ; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm14
@@ -4234,120 +4242,120 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm18
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm6
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm20
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm13
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
 ; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,9,8,9]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm22, %zmm11
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
-; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm12
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,9,8,9]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm21, %zmm12
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [1,0,2,2,1,0,2,2]
+; AVX512-FCP-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm23, %ymm5
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
 ; AVX512-FCP-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
 ; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-FCP-NEXT:    kmovw %eax, %k2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm11 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm6
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm11
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %ymm9, %ymm21, %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm6, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm11
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm22, %zmm5
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm5 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm5, %ymm21
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm5
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,8,8,0,9]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm15
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm6[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm4, %zmm14
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm21, %zmm21
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm12 {%k2}
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm5, %zmm1
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %ymm8, %ymm22, %ymm12
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm12[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm21, %zmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,8,8,0,9]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm4
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm21, %zmm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm23, %ymm4
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm14, %ymm14
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm2 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm4
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %zmm14, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm14 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX512-FCP-NEXT:    vpermt2d %ymm14, %ymm22, %ymm2
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm2[0,1,2,3],zmm5[0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm5[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm8
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
-; AVX512-FCP-NEXT:    vpermd %zmm18, %zmm4, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm22
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm18, %ymm22
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm5, %xmm7
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm21, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[8],ymm9[8],ymm2[9],ymm9[9],ymm2[10],ymm9[10],ymm2[11],ymm9[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm15 = [1,1,1,1,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm15, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
+; AVX512-FCP-NEXT:    vpermd %zmm18, %zmm7, %zmm18
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm18 {%k1}
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15]
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm2, %zmm11
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm10
+; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm0, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11]
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm21, %ymm18
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm8
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,0,10,10,0]
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm5, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm9, %zmm0
-; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm4, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm18, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm9
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm15, %zmm3
+; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm7, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm21, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm12, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm3
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm2
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm2
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm22, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm2, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm10[0,1,2,3]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm8
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm21, %zmm0, %zmm15
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rax)
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm19, %zmm0, %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 192(%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm22, %zmm0, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, (%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm19, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vpternlogd $184, %zmm17, %zmm0, %zmm1
@@ -4362,253 +4370,261 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-LABEL: store_i16_stride6_vf32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm6
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm2
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm11
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm27
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm8
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm12
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm15
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm5
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm27
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm28
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm28
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
 ; AVX512DQ-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm11
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm5
 ; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm4
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm31
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm16
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm26
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm30
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[0,0,2,1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3]
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm24
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm8[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm12[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm15[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm31
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm17
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm17
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm13
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm13[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm22
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm16
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm8[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm14
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm15
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm7
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm13
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm14
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[12],ymm15[12],ymm7[13],ymm15[13],ymm7[14],ymm15[14],ymm7[15],ymm15[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512DQ-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
 ; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm6, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm22
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm5[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm0[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm0
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm18, %zmm12
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm8, %zmm1, %zmm12 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm12, %ymm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm8, %zmm18
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm12
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm5, %ymm19
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm20 = ymm5[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm5
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm5[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm8, %ymm21
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm8[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm24, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm2
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm9
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[8],ymm15[8],ymm7[9],ymm15[9],ymm7[10],ymm15[10],ymm7[11],ymm15[11]
-; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm24, %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm9
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm0[3,3,3,3]
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm3
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm9, %zmm8, %zmm7 {%k1}
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm8
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm0
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm9, %zmm2, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm2, %ymm9
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm10
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm0[0,1,2,3],zmm9[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm9
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm0
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm25 = ymm10[2,1,2,3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm10, %zmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm5[2,1,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm16[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm6
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm5
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm4 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm10, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm10
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm7, %zmm4, %zmm5 {%k1}
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm5, %ymm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm30[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm10
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm16, %zmm11, %zmm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm21, %zmm12
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm17, %zmm11, %zmm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm22, %zmm4, %zmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm18, %zmm4, %zmm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm5, %zmm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm19, %zmm26, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm18, %zmm22, %zmm8
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm21, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm25, %zmm24, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm20, %zmm9, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm10
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm23, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 320(%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride6_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [1,1,1,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
@@ -4617,62 +4633,63 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm6
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm20, %zmm19
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm21, %zmm19
 ; AVX512DQ-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm19 {%k1}
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm23
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm17, %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,9,2,3,8,5,6,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm23
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm17, %ymm23
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [8,9,20,11,12,21,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm29
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm7
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm21, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm8
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,0,10,10,0]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm28
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm28
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm6, %ymm19
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,0,10,10,0]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm18, %zmm13
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm20, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm13, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm20, %ymm17
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm20, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm21, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm24
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm20, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm6, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm5
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,1,2,3,11,11,11,11]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
@@ -4684,126 +4701,126 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm4, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm4, %ymm9
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [8,21,10,11,20,13,14,23]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm15
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [8,21,10,11,20,13,14,23]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm4, %zmm9
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm22, %ymm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm0[0,1,2,3],zmm9[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm14, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm6
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,0,3,10,0,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm14
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm14
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm9
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm21, %ymm3
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm21, %ymm6
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm15 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm14 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm6, %zmm14, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm22, %ymm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm4[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm4
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm6
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm7, %ymm7
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm8, %ymm8
 ; AVX512DQ-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm7, %zmm6, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm7
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm12[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,8,8,0,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm10, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm8, %zmm6, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm11[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm9
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm8, %zmm6, %zmm9
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm4, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm4, %ymm6, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm9, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm5, %zmm10
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm8, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm9[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm7, %ymm7
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm7, %zmm4, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,8,8,0,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm11
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm10, %zmm6
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm5, %ymm10
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm10[0,1,2,3],zmm6[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm5
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm10, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm9, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm5, %zmm6
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm18, %zmm2, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm18, %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm17[0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm1, %zmm13
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm1, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm23[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm1, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 64(%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -8317,875 +8334,901 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride6_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $392, %rsp # imm = 0x188
+; AVX512-NEXT:    subq $600, %rsp # imm = 0x258
 ; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm9
 ; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm4
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm10
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm6
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm8
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vmovdqa 96(%r8), %ymm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm3, %ymm5
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqa 96(%r8), %ymm5
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm2
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%r9), %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm5
-; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm7
-; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm3
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm7
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm3
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15]
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm20
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm18
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vmovdqa 64(%r8), %ymm12
-; AVX512-NEXT:    vpshufb %ymm14, %ymm12, %ymm5
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovdqa 64(%r8), %ymm11
+; AVX512-NEXT:    vpshufb %ymm7, %ymm11, %ymm2
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%r9), %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm5
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm7
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm13
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15]
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm29
-; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm18
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm7
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vmovdqa %ymm7, %ymm8
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm4
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm14
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm14[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm4[4],ymm14[5],ymm4[5],ymm14[6],ymm4[6],ymm14[7],ymm4[7],ymm14[12],ymm4[12],ymm14[13],ymm4[13],ymm14[14],ymm4[14],ymm14[15],ymm4[15]
+; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm23
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm24
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm5[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm8[4],ymm2[5],ymm8[5],ymm2[6],ymm8[6],ymm2[7],ymm8[7],ymm2[12],ymm8[12],ymm2[13],ymm8[13],ymm2[14],ymm8[14],ymm2[15],ymm8[15]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm16
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm1
 ; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm15
-; AVX512-NEXT:    vpshufb %ymm14, %ymm15, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm3
+; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm0
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm8
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm13
-; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm8[4],ymm13[5],ymm8[5],ymm13[6],ymm8[6],ymm13[7],ymm8[7],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,3,3,5,6,7,7]
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11]
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm0[2,1,2,3]
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm20 = ymm3[1,2,3,3,5,6,7,7]
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm3[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm19, %zmm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa32 %zmm7, %zmm0 {%k1}
-; AVX512-NEXT:    vmovdqa (%r8), %ymm7
-; AVX512-NEXT:    vpshufb %ymm14, %ymm7, %ymm11
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm14
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm20, %zmm25, %zmm20
+; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm15
+; AVX512-NEXT:    vmovdqa32 %zmm20, %zmm15 {%k1}
+; AVX512-NEXT:    vmovdqa (%r8), %ymm0
+; AVX512-NEXT:    vpshufb %ymm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm15, %ymm14
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5,6],ymm7[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm13
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm14[0,1,2,3],zmm7[4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm7
 ; AVX512-NEXT:    vmovdqa (%rdx), %xmm14
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
-; AVX512-NEXT:    vpermt2d %zmm8, %zmm23, %zmm0
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm8
-; AVX512-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
-; AVX512-NEXT:    vmovdqa (%r8), %xmm8
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm5
-; AVX512-NEXT:    vmovdqa %xmm2, %xmm11
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; AVX512-NEXT:    vmovdqa64 %xmm14, %xmm26
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm29, %zmm1
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa %xmm3, %xmm14
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
-; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm4
-; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm5
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm23, %zmm7
-; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm0
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm2
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm7 {%k1}
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
-; AVX512-NEXT:    vmovdqa 96(%r8), %xmm1
-; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[8],ymm9[8],ymm6[9],ymm9[9],ymm6[10],ymm9[10],ymm6[11],ymm9[11]
+; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm3
+; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm4
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm29, %zmm6
+; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm1
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm0
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm6 {%k1}
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm5
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm5
+; AVX512-NEXT:    vmovdqa 96(%r8), %xmm2
+; AVX512-NEXT:    vpshufb %xmm14, %xmm2, %xmm8
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm27 = zmm6[0,1,2,3],zmm5[4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512-NEXT:    vpermt2d %zmm4, %zmm31, %zmm3
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm31, %zmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[0,1,0,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,0,1]
 ; AVX512-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-NEXT:    kmovw %eax, %k2
-; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k2}
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm0 {%k2}
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm22
-; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm3
-; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm2
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm28 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm3
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm1
-; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[0,1,0,1]
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm4 {%k2}
-; AVX512-NEXT:    vmovdqa 64(%r8), %xmm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm5
+; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm0[0,1,0,1]
+; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm6 {%k2}
+; AVX512-NEXT:    vmovdqa 64(%r8), %xmm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
-; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm7, %ymm7
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm28
-; AVX512-NEXT:    vmovdqa 96(%r9), %ymm7
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm5[2,1,2,3]
-; AVX512-NEXT:    vmovdqa 64(%r9), %ymm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm21
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm27 = ymm5[2,1,2,3]
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm5
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm9[2,2,2,3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm23, %zmm3
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm6
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm8, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm6[0,1,2,3],zmm5[4,5,6,7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm3
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm29, %zmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm3 {%k1}
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX512-NEXT:    vmovdqa %xmm11, %xmm4
-; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm20
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT:    vmovdqa %xmm14, %xmm4
+; AVX512-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-NEXT:    vmovdqa (%r9), %ymm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm20 = ymm1[2,2,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm1[2,1,2,3]
+; AVX512-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,1,0,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[2,2,2,2]
 ; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm0
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm1
-; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm13
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm29, %zmm6
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm15
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm3
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm23, %zmm9
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1]
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[8],ymm6[8],ymm11[9],ymm6[9],ymm11[10],ymm6[10],ymm11[11],ymm6[11]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm11
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm17 = ymm10[2,1,2,3]
-; AVX512-NEXT:    vmovdqa32 %zmm11, %zmm9 {%k1}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm9, %ymm11
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm15
-; AVX512-NEXT:    vpshufb %xmm4, %xmm15, %xmm12
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm19
-; AVX512-NEXT:    vmovdqa (%r9), %ymm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm6 {%k1}
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
+; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm8
+; AVX512-NEXT:    vpshufb %xmm4, %xmm8, %xmm9
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm2, %zmm12
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm9[0,1,2,3],zmm14[4,5,6,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
 ; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm12
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm9
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa 96(%r9), %xmm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm14[0,1,0,1]
+; AVX512-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm14 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm17 = ymm14[2,2,2,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1]
+; AVX512-NEXT:    vmovdqa32 %zmm9, %zmm0 {%k2}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm13[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm9, %ymm18
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm9[0,0,2,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm8, %ymm8
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
+; AVX512-NEXT:    vmovdqa 64(%r9), %xmm8
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm7, %ymm7
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm31, %zmm0
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,1,2,1]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm2[2,1,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
-; AVX512-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k2}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm15[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7]
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm15
-; AVX512-NEXT:    vmovdqa (%r9), %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm23 = ymm1[0,1,0,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[2,2,2,2]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsrldq {{.*#+}} xmm12 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512-NEXT:    vpermt2d %zmm11, %zmm31, %zmm1
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5]
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[0,1,0,1]
-; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm11 {%k2}
-; AVX512-NEXT:    vextracti64x4 $1, %zmm11, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[2,1,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7]
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
-; AVX512-NEXT:    vpbroadcastq %xmm8, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
-; AVX512-NEXT:    vmovdqa 96(%r9), %xmm8
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
-; AVX512-NEXT:    vmovdqa 64(%r9), %xmm13
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm14, %ymm14
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm25, %zmm24, %zmm24
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm26, %zmm26
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm26 # 64-byte Folded Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm9, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm30, %zmm17
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
-; AVX512-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm17 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm16, %zmm18, %zmm16
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm16 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm23, %zmm18
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm18 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512-NEXT:    vpternlogd $184, (%rsp), %zmm23, %zmm4 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
-; AVX512-NEXT:    vpternlogd $184, %zmm20, %zmm23, %zmm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm10, %zmm5
-; AVX512-NEXT:    vpternlogd $184, %zmm19, %zmm23, %zmm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm12, %zmm8
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm22, %zmm10, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm14, %zmm6
-; AVX512-NEXT:    vpternlogd $184, %zmm28, %zmm10, %zmm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm0
-; AVX512-NEXT:    vpternlogd $184, %zmm15, %zmm10, %zmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm10, %zmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm2[0,1,2,1]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm13 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm15
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[0,1,0,1]
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm15[2,3,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX512-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm26 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX512-NEXT:    vpbroadcastq %xmm26, %ymm3
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm15[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm3, %ymm3
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,1,4,5,6,7]
+; AVX512-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm16, %zmm20, %zmm4
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm5
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm24, %zmm16
+; AVX512-NEXT:    vpternlogd $184, %zmm27, %zmm10, %zmm16
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512-NEXT:    vpternlogd $184, %zmm25, %zmm10, %zmm8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
+; AVX512-NEXT:    vpternlogd $184, %zmm22, %zmm10, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm19, %zmm18, %zmm10
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm28, %zmm13, %zmm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512-NEXT:    vpternlogd $184, %zmm30, %zmm13, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm3, %zmm3
+; AVX512-NEXT:    vpternlogd $184, %zmm21, %zmm13, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm13, %zmm1
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm7, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm6, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm8, 576(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm18, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm16, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm17, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm26, 512(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm24, 704(%rax)
-; AVX512-NEXT:    addq $392, %rsp # imm = 0x188
+; AVX512-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm7, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm10, 576(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm16, 640(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm5, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm23, 512(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512-NEXT:    addq $600, %rsp # imm = 0x258
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride6_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $1240, %rsp # imm = 0x4D8
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512-FCP-NEXT:    subq $1256, %rsp # imm = 0x4E8
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm5
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm10
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm13
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm12
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm11
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm12
 ; AVX512-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm11
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm18
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm8, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rsp) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm10
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm10, %xmm19
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm23
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm10
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm12
 ; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm8
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[8],ymm0[8],ymm8[9],ymm0[9],ymm8[10],ymm0[10],ymm8[11],ymm0[11]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm4, %ymm12
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm26, %ymm8
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
 ; AVX512-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm25, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [8,21,10,11,20,13,14,23]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa %ymm14, %ymm12
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [8,21,10,11,20,13,14,23]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm8
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm25, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm29 = [12,1,2,13,4,5,14,7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm29, %ymm0
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [2,2,0,3,10,0,10,11]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [2,2,0,3,10,0,10,11]
 ; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
 ; AVX512-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
-; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm28, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm26, %ymm3
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %ymm5, %ymm25, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm5
-; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm4
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm25, %zmm3
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm29, %ymm0
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,2,1,8,9,8,9]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm1
-; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2]
-; AVX512-FCP-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm28, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm4
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm4
+; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm31 = [1,0,2,2,1,0,2,2]
+; AVX512-FCP-NEXT:    # ymm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm31, %ymm3
 ; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
 ; AVX512-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512-FCP-NEXT:    kmovw %eax, %k2
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm1
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm26, %ymm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm4 {%k2}
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm30 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm30, %zmm2
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm16, %ymm4
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm2[0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, %xmm6
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm23
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm5
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm1
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm26, %ymm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,8,8,0,9]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm17, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm21
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm20
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm18, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm31, %ymm2
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm30, %zmm2
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm16, %ymm3
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm2[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm17, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm3
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm25, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm5
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
-; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm23
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm27, %zmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm26, %ymm3
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm25, %zmm3
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm19
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm29, %ymm2
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm3[0,1,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm11, %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm29, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm9
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm27, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm2, %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm25
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm1
-; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm25, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm13
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm14, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm26, %ymm3
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm4
+; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm1, %zmm25
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2d %ymm4, %ymm29, %ymm1
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm1[0,1,2,3],zmm25[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm10
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm28, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm6
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm29
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm16, %zmm0
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-FCP-NEXT:    vpbroadcastq %xmm8, %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm4 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm8
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm17, %zmm4
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
-; AVX512-FCP-NEXT:    vpermt2d %ymm9, %ymm26, %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm31, %ymm1
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm6, %ymm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm7 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm6
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm6[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm30, %zmm1
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm15 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX512-FCP-NEXT:    vpermt2d %ymm15, %ymm16, %ymm7
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm26 = zmm7[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm15
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm15, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm0, %xmm7
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm1
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm30, %ymm4
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm0 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm0, %ymm26
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm17, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm31, %zmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm31, %zmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm12 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm12, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm27, %ymm1
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm30 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm30, %zmm2
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm31
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,0,10,10,0]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm28, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm31, %ymm18
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3]
+; AVX512-FCP-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm18, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm12 {%k2}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm14
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm14[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm12, %zmm30
+; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm16, %ymm12
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,2,3],zmm30[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm17, %zmm12
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm9, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm17
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm17
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm14, %xmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,9,2,3,8,5,6,11]
+; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm14, %ymm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm30
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,0,10,10,0]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm7 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm5 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm12, %zmm0
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm7 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm9, %zmm1
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm5 {%k1}
+; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm28
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm28
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm29
-; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm27, %ymm29
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm1
+; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm14, %ymm5
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm13
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm9, %zmm0
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm6
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm12, %zmm10
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm10, %zmm0 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm6
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm19
-; AVX512-FCP-NEXT:    vpermt2d %ymm10, %ymm27, %ymm19
-; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm30, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm10
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
-; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa32 %zmm14, %zmm10 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm20
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm14, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm13
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm9, %zmm11
+; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm9
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermi2d %zmm9, %zmm0, %zmm18
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpermt2d %ymm6, %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm11
 ; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm10, %ymm27
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm30, %zmm10
-; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm14
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm11
-; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm12
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm27, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm8, %zmm9, %zmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm27, %zmm8
+; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm27, %zmm9
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm7
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm18[0,1,2,3]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm6
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm19, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm9, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rax)
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm29, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm9, %zmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 640(%rax)
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm9, %zmm3
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm20[0,1,2,3]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 448(%rax)
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm28[0,1,2,3]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm17[0,1,2,3]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm17, %zmm0, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm16, %zmm0, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm16, %zmm0, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm26, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm25, %zmm1, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm25, %zmm1, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 320(%rax)
@@ -9201,15 +9244,15 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 704(%rax)
-; AVX512-FCP-NEXT:    addq $1240, %rsp # imm = 0x4D8
+; AVX512-FCP-NEXT:    addq $1256, %rsp # imm = 0x4E8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride6_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $584, %rsp # imm = 0x248
+; AVX512DQ-NEXT:    subq $936, %rsp # imm = 0x3A8
 ; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -9219,7 +9262,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
 ; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm4, (%rsp) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm5
@@ -9239,12 +9282,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm5
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm4
 ; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
@@ -9255,47 +9308,57 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm5
+; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm8
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1]
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm7
+; AVX512DQ-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm29
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm30
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm7
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm5
+; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[2,1,3,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
-; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm31
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm5
+; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
@@ -9305,30 +9368,39 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm27
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm31
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm19
 ; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm3
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm15
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm20
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm21
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm18
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
 ; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1]
 ; AVX512DQ-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
@@ -9337,11 +9409,12 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm18
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
 ; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
@@ -9352,32 +9425,34 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm28
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm27
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm12
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm9
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm4
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm11
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm4[4],ymm11[5],ymm4[5],ymm11[6],ymm4[6],ymm11[7],ymm4[7],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm21
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512DQ-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-NEXT:    kmovw %eax, %k1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm6
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm9
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm6, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm9, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm2
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
@@ -9388,49 +9463,50 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm16
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm24
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm7
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm5
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm12
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm10
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm8
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm3
+; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm3, %ymm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm2[4],ymm14[5],ymm2[5],ymm14[6],ymm2[6],ymm14[7],ymm2[7],ymm14[12],ymm2[12],ymm14[13],ymm2[13],ymm14[14],ymm2[14],ymm14[15],ymm2[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm11
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm10
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm7
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -9440,257 +9516,252 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm25
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm5[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm0[3,3,3,3]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm0
-; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm15 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm25 = ymm1[2,2,2,2]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm15, %zmm25, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm15, %ymm14
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm13
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm1, %zmm23
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm13
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm24, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm17 = ymm0[2,1,2,3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm16 = ymm0[3,3,3,3]
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm4
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm25
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11]
-; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm16, %zmm17, %zmm0
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm1, %ymm8
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm13
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6],ymm8[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm8
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm8[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm8
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm30
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm13
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm8[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm13[0,0,2,1]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm15
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm13[2,2,2,3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm16
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm5[2,1,2,3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm9
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm26
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm6
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11]
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm24, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm15
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm17 = ymm5[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm4, %zmm11
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm16
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm16 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm4[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm2, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm11 {%k1}
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm14
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm9, %xmm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm27
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm9
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[0,0,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm29
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm2[0,0,2,1]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm11
-; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm24, %zmm2
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm3[2,2,2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm4[2,1,2,3]
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm10 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm4
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm7 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm13[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm7, %zmm10, %zmm2 {%k1}
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm11, %ymm3
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm4, %zmm13
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm4, %zmm5
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm13 {%k1}
+; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm6, %zmm4, %zmm5 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm15[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm6
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm10
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm7[2,1,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm10[0,1,2,3],zmm9[4,5,6,7]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm10
+; AVX512DQ-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
 ; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm10
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm12
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm13[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm12[0,1,2,3],zmm11[4,5,6,7]
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm5, %ymm12
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm10[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm9
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm15
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm15[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm12[4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
+; AVX512DQ-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm15 = mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512DQ-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm7 = mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
+; AVX512DQ-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm13 = mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm30, %zmm29, %zmm17
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm17 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm24, %zmm22
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm22 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm23, %zmm24, %zmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm5
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm25, %zmm13, %zmm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm6, %zmm6
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm26, %zmm13, %zmm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm27, %zmm13, %zmm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm13, %zmm8
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm25
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm25 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm18, %zmm23, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm21, %zmm17, %zmm16
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm16 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm24, %zmm14
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm29, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm19, %zmm9, %zmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm20, %zmm9, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm22, %zmm9, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm13, %zmm10
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm10
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 640(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 576(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 512(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 704(%rax)
-; AVX512DQ-NEXT:    addq $584, %rsp # imm = 0x248
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm25, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 512(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512DQ-NEXT:    addq $936, %rsp # imm = 0x3A8
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride6_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $1176, %rsp # imm = 0x498
+; AVX512DQ-FCP-NEXT:    subq $1224, %rsp # imm = 0x4C8
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm0
@@ -9700,8 +9771,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
@@ -9711,95 +9782,95 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm8
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm30
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm28
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm16
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [1,1,1,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm21
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm20, %zmm22
 ; AVX512DQ-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm21 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm24, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm22 {%k1}
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm21
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,0,0,1,0,10,10,0]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [0,9,2,3,8,5,6,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm19, %ymm22
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,0,10,10,0]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm23
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm17, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm19, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm4, %zmm25 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm20, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm4, %zmm26 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm10
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm18, %zmm25
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm19, %ymm26
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm11
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm23, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9809,329 +9880,332 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm12
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm17, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm19, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm6, %zmm28 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm24, %ymm20
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm18, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermd %zmm17, %zmm20, %zmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm6, %zmm17 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm18, %zmm28
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm7
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm7, %ymm19, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm7
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm17, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm19, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm19 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm19, %ymm24
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm18, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm20, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm20 {%k1}
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm19
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm20, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm19, %ymm20
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm23, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm5, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm8
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [2,1,2,3,11,11,11,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [5,6,5,6,5,6,7,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm26, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm27, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm1, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm22, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [8,21,10,11,20,13,14,23]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm29 = [8,21,10,11,20,13,14,23]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm29, %zmm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm30 = [12,1,2,13,4,5,14,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm30, %ymm8
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm8
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm11
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,0,3,10,0,10,11]
 ; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm12, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm29, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm26, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm27, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
 ; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm10
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm23, %zmm2
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm22, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm29, %zmm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm30, %ymm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm12
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm2
-; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2]
-; AVX512DQ-FCP-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm19 = [0,0,2,1,8,9,8,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,0,2,2,1,0,2,2]
+; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm0, %ymm4
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
 ; AVX512DQ-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
 ; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm2 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm5
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm4, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm24[0],zero,xmm24[1],zero,xmm24[2],zero,xmm24[3],zero
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm16, %ymm3
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, %xmm9
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,8,8,0,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm16, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,8,8,0,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm19, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm30, %ymm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm6, %zmm2, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm17, %zmm5
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm16, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15]
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm26, %ymm2
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm2, %zmm7, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm22, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm2
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm29, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm27
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm3, %zmm4 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm16, %ymm4
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm4[0,1,2,3],zmm3[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm8, %xmm21
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm18, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm29, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm31, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15]
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm27, %ymm5
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm6, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm5
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm23, %zmm6
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm5, %ymm30, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm13, %ymm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm26, %ymm7
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm11 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm12 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm7, %zmm11, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm7, %ymm2, %ymm22
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm29, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm31, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsp), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm12
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm18, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm30, %ymm12
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm8
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm13[4],ymm10[5],ymm13[5],ymm10[6],ymm13[6],ymm10[7],ymm13[7],ymm10[12],ymm13[12],ymm10[13],ymm13[13],ymm10[14],ymm13[14],ymm10[15],ymm13[15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm29, %zmm5
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm6[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm31, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm6[4],ymm11[5],ymm6[5],ymm11[6],ymm6[6],ymm11[7],ymm6[7],ymm11[12],ymm6[12],ymm11[13],ymm6[13],ymm11[14],ymm6[14],ymm11[15],ymm6[15]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm27, %ymm8
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm10 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm8, %zmm10, %zmm5 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm8
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm8, %zmm5, %zmm23
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm8, %ymm30, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm6, %ymm8
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm10, %zmm31, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm11
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm19, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm13, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm18, %zmm8
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm13, %zmm12, %zmm11 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm9[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm14
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm13, %ymm30, %ymm13
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm10 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm10, %zmm13, %zmm8 {%k2}
-; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
-; AVX512DQ-FCP-NEXT:    vpermi2d %ymm10, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm9[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm22, %zmm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm11
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm16, %zmm12
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm16, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm8, %zmm11
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm13, %zmm11, %zmm10 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsp), %xmm15 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm15[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm28, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm14, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm19, %zmm13
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm14, %ymm14
+; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm14, %zmm0, %zmm13 {%k2}
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm13, %zmm28
+; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm9, %ymm16, %ymm10
+; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm16, %ymm13
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm23[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm11[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm9
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm6[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm18, %zmm10
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm28[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm11
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm18, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm9, %zmm11, %zmm13
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm10, %zmm8, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm27, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm24, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm6, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm20, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm6, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm23, %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 576(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 512(%rax)
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, (%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm11, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm29, %zmm5, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm25[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm0 = zmm17[0,1,2,3],mem[0,1,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm12, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 576(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 512(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 704(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm0 = zmm26[0,1,2,3],mem[0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 640(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm0 = zmm22[0,1,2,3],mem[0,1,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $1176, %rsp # imm = 0x498
+; AVX512DQ-FCP-NEXT:    addq $1224, %rsp # imm = 0x4C8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index dc362d729fcd3..208ee607909ed 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -2850,8 +2850,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vporq %ymm1, %ymm4, %ymm17
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm4
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17,u,u],zero,zero
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[16,17,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
 ; AVX512-NEXT:    vporq %ymm5, %ymm10, %ymm19
 ; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm5
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
@@ -2982,8 +2982,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm17
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm14
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u],zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
 ; AVX512-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm18
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6]
@@ -3112,8 +3112,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vporq %ymm1, %ymm4, %ymm17
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm4
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17,u,u],zero,zero
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[16,17,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
 ; AVX512DQ-NEXT:    vporq %ymm5, %ymm10, %ymm19
 ; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm5
 ; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
@@ -3244,8 +3244,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u],zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm3, %ymm18
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6]
@@ -5893,1257 +5893,1237 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride7_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $680, %rsp # imm = 0x2A8
+; AVX512-NEXT:    subq $632, %rsp # imm = 0x278
 ; AVX512-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm27
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm8
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm13, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm29
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm9
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm1
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm12, %ymm9, %ymm0
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
+; AVX512-NEXT:    vpshufb %ymm12, %ymm1, %ymm0
+; AVX512-NEXT:    vmovdqa %ymm1, %ymm15
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512-NEXT:    vmovdqa (%r8), %ymm4
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm17
+; AVX512-NEXT:    vmovdqa (%r9), %ymm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm10, %ymm2
+; AVX512-NEXT:    vmovdqa (%r8), %ymm11
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm11, %ymm3
 ; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm2
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm10
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm6
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
+; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512-NEXT:    vpshufb %ymm12, %ymm10, %ymm3
-; AVX512-NEXT:    vpshufb %ymm15, %ymm2, %ymm4
-; AVX512-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512-NEXT:    vpshufb %ymm12, %ymm6, %ymm2
+; AVX512-NEXT:    vpshufb %ymm14, %ymm3, %ymm4
+; AVX512-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512-NEXT:    vpshufb %ymm13, %ymm12, %ymm3
-; AVX512-NEXT:    vpshufb %ymm14, %ymm15, %ymm4
-; AVX512-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512-NEXT:    vpshufb %ymm9, %ymm12, %ymm2
+; AVX512-NEXT:    vpshufb %ymm13, %ymm7, %ymm4
+; AVX512-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%r9), %ymm13
 ; AVX512-NEXT:    vmovdqa 32(%r8), %ymm14
-; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
-; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm26
-; AVX512-NEXT:    vmovdqa (%r9), %xmm4
-; AVX512-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512-NEXT:    vpermi2d %zmm7, %zmm6, %zmm25
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
+; AVX512-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vprold $16, %ymm10, %ymm4
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8,9,10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15]
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10]
+; AVX512-NEXT:    vpermi2q %zmm4, %zmm5, %zmm27
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm31
+; AVX512-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm0
 ; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa (%rax), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm21
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm28
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm28
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm29
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm30
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512-NEXT:    vprold $16, %ymm13, %ymm4
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm23
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm19
+; AVX512-NEXT:    vprold $16, %ymm13, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm5
-; AVX512-NEXT:    vprold $16, %xmm5, %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
-; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm4
-; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm18
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm18
+; AVX512-NEXT:    vmovdqa %ymm15, %ymm8
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512-NEXT:    vprold $16, %xmm9, %xmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7]
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512-NEXT:    vpermd %zmm2, %zmm1, %zmm27
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm1
-; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm5
-; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm14
-; AVX512-NEXT:    vmovdqa %xmm1, %xmm4
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm22
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512-NEXT:    vprold $16, %ymm16, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
-; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
-; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm6 = mem[2,1,3,2]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm5 = mem[2,2,2,3]
-; AVX512-NEXT:    vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[0,2,2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
-; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm0 = mem[2,1,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
-; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm15 = mem[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm28
-; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm29
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %zmm28, %zmm30, %zmm29
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm1, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm30, %zmm9
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
-; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm3 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm10 = mem[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
-; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm12 = mem[0,0,1,1]
-; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm17 = mem[2,2,2,3]
-; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm18 = mem[2,1,3,2]
-; AVX512-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm19 = mem[2,2,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm9
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
+; AVX512-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm15
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm15[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3],xmm0[4],xmm10[5,6],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7]
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm25
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm14[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8,9,10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm6
+; AVX512-NEXT:    vpermd %zmm6, %zmm3, %zmm3
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm7
+; AVX512-NEXT:    vpshufb %ymm7, %ymm6, %ymm10
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm6
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm10
+; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm13
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512-NEXT:    vmovdqa %ymm4, %ymm2
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm12
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm4
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm30[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm23[0,2,2,3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm5
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm19[2,1,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm18[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm5[2,1,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6,7,8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14,15]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm12
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm11, %zmm7
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %zmm12, %zmm11, %zmm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm30, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm31, %zmm1
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm11, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm9 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm11, %zmm9
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm10 = mem[2,1,3,2]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm12 = mem[2,2,2,3]
+; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm15 = mem[2,3,3,3,6,7,7,7]
+; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm17 = mem[0,0,2,1]
+; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm8 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm26[0,0,1,1]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm13
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm24[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm22[2,1,3,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm21[2,2,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm20[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm16[0,0,2,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm9
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm15[2,1,3,2]
+; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm17, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm18, %zmm8
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm8
+; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm1
+; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm19, %zmm8, %zmm8
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm28, %zmm9, %zmm9
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm29[0,1,2,3]
+; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm8
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8
+; AVX512-NEXT:    vpbroadcastd (%rax), %ymm9
+; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
-; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm4
-; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm4
-; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm2
-; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512-NEXT:    vpermd (%rax), %zmm2, %zmm2
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2
 ; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm18, %zmm4, %zmm4
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
-; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20
-; AVX512-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm3
-; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3
-; AVX512-NEXT:    vpbroadcastd (%rax), %ymm4
-; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm4
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm5
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm3
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512-NEXT:    vpermd (%rax), %zmm6, %zmm6
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm6, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm1, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm7, 192(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm27, 384(%rax)
-; AVX512-NEXT:    addq $680, %rsp # imm = 0x2A8
+; AVX512-NEXT:    vmovdqa64 %zmm3, 384(%rax)
+; AVX512-NEXT:    addq $632, %rsp # imm = 0x278
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride7_vf32:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512-FCP-NEXT:    subq $248, %rsp
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm10
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
 ; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm9
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm6
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
 ; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm17
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
 ; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm15
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm15, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm11
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm8
-; AVX512-FCP-NEXT:    vpor %ymm4, %ymm8, %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
-; AVX512-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm13
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm14
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm7
+; AVX512-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm25
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
+; AVX512-FCP-NEXT:    vpor %ymm6, %ymm5, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm15
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm21
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm30
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
 ; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm24
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm25
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm10
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm23
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm3, %xmm4
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm21
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,8,8,8,9]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
-; AVX512-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm23
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm10, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm20
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm7
+; AVX512-FCP-NEXT:    vpermi2q %zmm7, %zmm4, %zmm22
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm2, %xmm2
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm26
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm1
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm29
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermi2q %zmm1, %zmm3, %zmm27
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm6
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm2
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm7, %zmm3, %zmm28
-; AVX512-FCP-NEXT:    vprold $16, %ymm15, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm15, %ymm5
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm5, %zmm31
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm19
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm27
+; AVX512-FCP-NEXT:    vprold $16, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15]
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm4
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm28
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,12,13,0,15]
+; AVX512-FCP-NEXT:    vpermi2q %zmm4, %zmm2, %zmm25
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm10
+; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm2
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm9
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm30
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm13, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm22, %zmm9
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm22, %ymm4
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6,7,8],ymm4[9],ymm12[10,11],ymm4[12],ymm12[13,14,15]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm12
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7,8,9],ymm8[10],ymm12[11,12],ymm8[13],ymm12[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm20, %zmm12
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15]
-; AVX512-FCP-NEXT:    vprold $16, %ymm13, %ymm8
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7,8,9],ymm8[10],ymm15[11,12],ymm8[13],ymm15[14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm14, %zmm8
-; AVX512-FCP-NEXT:    vprold $16, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm14, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm15
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm7, %ymm4
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm24, %zmm16, %zmm12
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15
-; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm5
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm31
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm2
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm2, %zmm21, %zmm29
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
+; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm21, %ymm2
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm16
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm13
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm12
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7,8,9],ymm13[10],ymm3[11,12],ymm13[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vprold $16, %ymm15, %ymm13
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm6
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6],xmm10[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vmovdqa %xmm11, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm10
+; AVX512-FCP-NEXT:    vprold $16, %xmm14, %xmm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm11
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0],xmm2[1],xmm14[2,3],xmm2[4],xmm14[5,6],xmm2[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm24, %zmm2, %zmm12
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm3
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31]
-; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm7
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm26, %zmm2, %zmm8
-; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm12
-; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm12
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27
-; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6,7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm8
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm18[0,0,1,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm17[2,2,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7]
-; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vpermi2d %zmm3, %zmm8, %zmm7
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7
+; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm6
+; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm26, %zmm8, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0
+; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
+; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm10
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm22, %zmm5
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm2, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm25, %zmm16, %zmm21
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm19[0,0,1,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm17[2,1,3,2]
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5,6,7,8],ymm0[9],ymm14[10,11],ymm0[12],ymm14[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm1[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm14
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm14
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [6,0,0,0,7,0,0,7]
+; AVX512-FCP-NEXT:    vpermd %ymm9, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm21, %zmm3
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm8, %zmm9
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm23, %zmm2, %zmm20
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm31
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm31
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm4, %zmm4
 ; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm16
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16
 ; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm2
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm2
-; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm3
-; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm0
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 64(%rax)
+; AVX512-FCP-NEXT:    addq $248, %rsp
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride7_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $680, %rsp # imm = 0x2A8
+; AVX512DQ-NEXT:    subq $632, %rsp # imm = 0x278
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm27
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm8
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm29
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm9
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm9, %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqa %ymm1, %ymm15
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm17
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm10
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm10, %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm11
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm11, %ymm3
 ; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm10
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
+; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm10, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm2, %ymm4
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm6, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm12, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm15, %ymm4
-; AVX512DQ-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm12, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm7, %ymm4
+; AVX512DQ-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm13
 ; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm14
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
-; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm26
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm4
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm6, %zmm25
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
+; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm1, %zmm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vprold $16, %ymm10, %ymm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8,9,10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15]
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-NEXT:    vpermi2q %zmm4, %zmm5, %zmm27
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm31
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm0
 ; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm3, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm21
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm28
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm28
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm29
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm30
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512DQ-NEXT:    vprold $16, %ymm13, %ymm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm3
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm23
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm19
+; AVX512DQ-NEXT:    vprold $16, %ymm13, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm5
-; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm18
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm18
+; AVX512DQ-NEXT:    vmovdqa %ymm15, %ymm8
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQ-NEXT:    vprold $16, %xmm9, %xmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7]
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm1, %zmm27
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm14
-; AVX512DQ-NEXT:    vmovdqa %xmm1, %xmm4
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm22
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512DQ-NEXT:    vprold $16, %ymm16, %ymm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
-; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm6 = mem[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm5 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm0 = mem[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm15 = mem[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm28
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm29
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm28, %zmm30, %zmm29
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm30, %zmm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm3 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm10 = mem[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm12 = mem[0,0,1,1]
-; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm14 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm17 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm18 = mem[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm19 = mem[2,2,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm9
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm15
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm10 = xmm15[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3],xmm0[4],xmm10[5,6],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm25
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm14[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8,9,10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm6
+; AVX512DQ-NEXT:    vpermd %zmm6, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm7
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm6, %ymm10
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm6
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm13
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512DQ-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm12
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm4
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm30[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm23[0,2,2,3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm5
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm19[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm18[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm5[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm12 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6,7,8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14,15]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm11, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm12, %zmm11, %zmm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm30, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm31, %zmm1
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm11, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm9 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm11, %zmm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm10 = mem[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm12 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm15 = mem[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm17 = mem[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm8 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm26[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm13
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm24[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm22[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm21[2,2,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm20[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm16[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm9
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm15[2,1,3,2]
+; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm17, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm18, %zmm8
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm8
+; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm1
+; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm19, %zmm8, %zmm8
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm28, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm29[0,1,2,3]
+; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm8
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8
+; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm9
+; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
-; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm4
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm4
-; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm2
-; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512DQ-NEXT:    vpermd (%rax), %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2
 ; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm18, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
-; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm3
-; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3
-; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm4
-; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm4
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm30, %zmm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm5
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm3
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512DQ-NEXT:    vpermd (%rax), %zmm6, %zmm6
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 192(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm27, 384(%rax)
-; AVX512DQ-NEXT:    addq $680, %rsp # imm = 0x2A8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 384(%rax)
+; AVX512DQ-NEXT:    addq $632, %rsp # imm = 0x278
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    subq $248, %rsp
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm18
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm17
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm15, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
-; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm7
+; AVX512DQ-FCP-NEXT:    vporq %ymm6, %ymm7, %ymm25
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm11, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm5
+; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm21
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm30
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm2, %zmm24
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm27
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm23
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, %xmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,8,8,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm6, %zmm0, %zmm23
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm10, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm20
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm7
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm4, %zmm22
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm2, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm26
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm1, %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm1, %zmm3, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm6
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm1, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm7, %zmm3, %zmm28
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm15, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm15, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm5, %zmm31
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm0, %zmm27
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm14[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm4, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm14
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,12,13,0,15]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm4, %zmm2, %zmm25
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm10
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm2
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm30
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm22, %zmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm22, %ymm4
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6,7,8],ymm4[9],ymm12[10,11],ymm4[12],ymm12[13,14,15]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7,8,9],ymm8[10],ymm12[11,12],ymm8[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm20, %zmm12
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm13, %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7,8,9],ymm8[10],ymm15[11,12],ymm8[13],ymm15[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm14, %zmm8
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm14, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm7, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm24, %zmm16, %zmm12
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [0,1,0,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm2, %zmm21, %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm21, %ymm2
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm16
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm30, %zmm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7,8,9],ymm13[10],ymm3[11,12],ymm13[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm15, %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm6
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm4, %zmm10
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm14, %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm11
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm14[0],xmm2[1],xmm14[2,3],xmm2[4],xmm14[5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm24, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm12, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm3
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm2, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm26, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm12
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm12
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6,7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm18[0,0,1,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm17[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm3, %zmm8, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm6
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm26, %zmm8, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm10
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm22, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm2, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm25, %zmm16, %zmm21
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm19[0,0,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm17[2,1,3,2]
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5,6,7,8],ymm0[9],ymm14[10,11],ymm0[12],ymm14[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm1[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm30, %zmm14
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm14
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [6,0,0,0,7,0,0,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm9, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm8, %zmm9
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm23, %zmm2, %zmm20
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm31
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm31
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm4, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm16
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm0
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 64(%rax)
+; AVX512DQ-FCP-NEXT:    addq $248, %rsp
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -12639,2656 +12619,2682 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride7_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $2808, %rsp # imm = 0xAF8
-; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm6
-; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm13
+; AVX512-NEXT:    subq $2168, %rsp # imm = 0x878
+; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm9
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm7
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm8
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm6, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm3
-; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm17
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm8, %ymm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
-; AVX512-NEXT:    vporq %ymm2, %ymm3, %ymm18
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm11, %ymm9, %ymm2
+; AVX512-NEXT:    vporq %ymm1, %ymm2, %ymm16
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-NEXT:    vpshufb %ymm13, %ymm7, %ymm2
+; AVX512-NEXT:    vporq %ymm1, %ymm2, %ymm17
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
 ; AVX512-NEXT:    vmovdqa 64(%r9), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa %ymm3, %ymm10
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm21
 ; AVX512-NEXT:    vmovdqa 64(%r8), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm27
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm3
-; AVX512-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-NEXT:    vpshufb %ymm15, %ymm3, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm6
+; AVX512-NEXT:    vpshufb %ymm11, %ymm6, %ymm2
+; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm5
+; AVX512-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512-NEXT:    vpshufb %ymm11, %ymm4, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm22
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vpshufb %ymm13, %ymm4, %ymm2
+; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqa %ymm10, %ymm3
+; AVX512-NEXT:    vmovdqa (%r8), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa (%r8), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm2
 ; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm2
-; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512-NEXT:    vpshufb %ymm10, %ymm5, %ymm0
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
-; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpshufb %ymm12, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm28
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX512-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm22
+; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm1
+; AVX512-NEXT:    vpshufb %ymm11, %ymm1, %ymm11
+; AVX512-NEXT:    vpor %ymm0, %ymm11, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm9
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm0
-; AVX512-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
-; AVX512-NEXT:    vpor %ymm10, %ymm9, %ymm9
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm10, %ymm6, %ymm9
-; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm31
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm0
+; AVX512-NEXT:    vpshufb %ymm12, %ymm0, %ymm12
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512-NEXT:    vpshufb %ymm13, %ymm11, %ymm13
+; AVX512-NEXT:    vpor %ymm12, %ymm13, %ymm10
+; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm13
+; AVX512-NEXT:    vpshufb %ymm15, %ymm13, %ymm12
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm15
+; AVX512-NEXT:    vpshufb %ymm3, %ymm15, %ymm14
+; AVX512-NEXT:    vpor %ymm14, %ymm12, %ymm10
+; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vprold $16, %ymm15, %ymm12
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%r9), %ymm9
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15]
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11]
+; AVX512-NEXT:    vpermi2q %zmm10, %zmm12, %zmm14
+; AVX512-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm14
+; AVX512-NEXT:    vpshufb %ymm12, %ymm14, %ymm10
+; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm19
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
 ; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
+; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
+; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512-NEXT:    vprold $16, %ymm9, %ymm8
-; AVX512-NEXT:    vpshufb %ymm12, %ymm9, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm17, %zmm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
-; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7
-; AVX512-NEXT:    vmovdqa 96(%r8), %ymm6
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm16, %zmm8
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm17, %zmm7
+; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vmovdqa 96(%r8), %ymm10
 ; AVX512-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $248, %ymm11, %ymm7, %ymm6
-; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $248, %ymm11, %ymm6, %ymm9
-; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm6
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17,u,u,u,u],zero,zero
+; AVX512-NEXT:    vpternlogq $248, %ymm26, %ymm7, %ymm8
+; AVX512-NEXT:    vmovdqa 96(%r9), %ymm12
+; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %ymm3, %ymm12, %ymm9
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $248, %ymm14, %ymm8, %ymm9
+; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm7
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogq $184, %ymm6, %ymm10, %ymm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %ymm7, %ymm8, %ymm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpternlogq $184, %ymm7, %ymm10, %ymm8
+; AVX512-NEXT:    vprold $16, %ymm12, %ymm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %ymm8, %ymm14, %ymm7
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,2,3],zmm7[4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512-NEXT:    vmovdqa 96(%rax), %ymm6
-; AVX512-NEXT:    vpermd %zmm6, %zmm18, %zmm7
+; AVX512-NEXT:    vmovdqa 96(%rax), %ymm7
+; AVX512-NEXT:    vpermd %zmm7, %zmm18, %zmm9
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm7[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
+; AVX512-NEXT:    vpandnq %ymm9, %ymm26, %ymm9
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
 ; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX512-NEXT:    vpandn %ymm7, %ymm12, %ymm7
-; AVX512-NEXT:    vmovdqa64 %zmm12, %zmm19
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-NEXT:    vpbroadcastd 72(%rax), %ymm6
-; AVX512-NEXT:    vpandnq %ymm6, %ymm28, %ymm6
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-NEXT:    vpbroadcastd 72(%rax), %ymm7
+; AVX512-NEXT:    vpandnq %ymm7, %ymm16, %ymm9
 ; AVX512-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r9), %xmm7
-; AVX512-NEXT:    vmovdqa 64(%r8), %xmm8
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm17
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm20
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-NEXT:    vmovdqa %xmm7, %xmm15
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm9
-; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm14
-; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm10
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm8
-; AVX512-NEXT:    vpandnq %ymm8, %ymm28, %ymm8
-; AVX512-NEXT:    vmovdqa (%rax), %ymm12
-; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm12, %ymm13
-; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %xmm6
-; AVX512-NEXT:    vmovdqa (%r8), %xmm12
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm29
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm24
-; AVX512-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm15, %xmm25
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX512-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
-; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm12
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm21
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm16
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
-; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
-; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm15
-; AVX512-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm7, %ymm12
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpbroadcastd 8(%rax), %ymm9
+; AVX512-NEXT:    vpandnq %ymm9, %ymm16, %ymm9
+; AVX512-NEXT:    vmovdqa (%rax), %ymm8
+; AVX512-NEXT:    vpshufb %ymm10, %ymm8, %ymm12
+; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm24
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7,8,9],ymm9[10],ymm12[11,12],ymm9[13],ymm12[14,15]
+; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm11[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15]
+; AVX512-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm9
+; AVX512-NEXT:    vpshufb %ymm10, %ymm9, %ymm10
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512-NEXT:    vpandnq %ymm12, %ymm19, %ymm12
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm6
-; AVX512-NEXT:    vpshufb %ymm6, %ymm4, %ymm11
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vprold $16, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpandnq %ymm12, %ymm26, %ymm12
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm13
+; AVX512-NEXT:    vpshufb %ymm13, %ymm2, %ymm10
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
+; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermd %zmm15, %zmm18, %zmm0
+; AVX512-NEXT:    vpermd %zmm9, %zmm18, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm12
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[1,1,1,1,5,5,5,5]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm2
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm2
 ; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm31, %ymm11
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vprold $16, %ymm5, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vmovdqa %ymm2, %ymm6
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm8
+; AVX512-NEXT:    vprold $16, %ymm21, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm20[1,2,2,3,5,6,6,7]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm20[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10]
+; AVX512-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512-NEXT:    vpermd 64(%rax), %zmm11, %zmm0
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm26, %zmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
 ; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm0
-; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm12
-; AVX512-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vprold $16, %xmm2, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm3, %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm20[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19]
 ; AVX512-NEXT:    vmovdqa 96(%r9), %xmm0
 ; AVX512-NEXT:    vmovdqa 96(%r8), %xmm1
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm18, %zmm0
-; AVX512-NEXT:    vpbroadcastd 100(%rax), %ymm1
-; AVX512-NEXT:    vpbroadcastd 104(%rax), %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm31
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
-; AVX512-NEXT:    vmovdqa %xmm4, %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpermt2d %zmm4, %zmm12, %zmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm7[2,3,3,3,6,7,7,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
+; AVX512-NEXT:    vpbroadcastd 96(%rax), %ymm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm4
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm3
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512-NEXT:    vprold $16, %xmm3, %xmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm23
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm3
+; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm4
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm5
+; AVX512-NEXT:    vmovdqa %xmm6, %xmm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm29, %zmm1
+; AVX512-NEXT:    vpbroadcastd 100(%rax), %ymm2
+; AVX512-NEXT:    vpbroadcastd 104(%rax), %ymm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm14, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm1
+; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm7, %xmm8
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512-NEXT:    vprold $16, %xmm10, %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm0, %zmm14
-; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm3, %zmm1
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%r9), %xmm1
+; AVX512-NEXT:    vmovdqa 64(%r8), %xmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm17
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512-NEXT:    vpermt2d %zmm3, %zmm4, %zmm1
-; AVX512-NEXT:    vpbroadcastd 64(%rax), %ymm3
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm6, %zmm1
+; AVX512-NEXT:    vpbroadcastd 64(%rax), %ymm2
 ; AVX512-NEXT:    vpbroadcastd 68(%rax), %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm25
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm19, %zmm25
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm8, %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm8, %xmm14
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3],xmm1[4],xmm5[5,6],xmm1[7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpermt2d %zmm5, %zmm3, %zmm1
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm19
+; AVX512-NEXT:    vprold $16, %xmm5, %xmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm2
+; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm2, %zmm3
-; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX512-NEXT:    vprold $16, %xmm21, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm0, %zmm5
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm0
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpermt2d %zmm1, %zmm4, %zmm0
-; AVX512-NEXT:    vpbroadcastd (%rax), %ymm1
-; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm19, %zmm20
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm6, %zmm1
+; AVX512-NEXT:    vpbroadcastd (%rax), %ymm3
+; AVX512-NEXT:    vpbroadcastd 4(%rax), %ymm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm2
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm7
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm22[1,1,1,1,5,5,5,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm22
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm21
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vprold $16, %ymm2, %ymm1
 ; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm0
-; AVX512-NEXT:    vmovdqa %ymm2, %ymm9
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vmovdqa %ymm3, %ymm11
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512-NEXT:    vprold $16, %ymm7, %ymm0
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7]
-; AVX512-NEXT:    vmovdqa %ymm4, %ymm8
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm9
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX512-NEXT:    vprold $16, %xmm10, %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8,9,10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15]
+; AVX512-NEXT:    vpermt2q %zmm1, %zmm10, %zmm4
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm18
+; AVX512-NEXT:    vpermd (%rax), %zmm11, %zmm1
+; AVX512-NEXT:    vpternlogd $184, %zmm4, %zmm26, %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6,7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14,15]
 ; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm1
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm3
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm27
-; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm1
-; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm2
-; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm13
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm7
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %zmm3, %zmm4, %zmm1
-; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm3 = mem[2,1,3,2]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %ymm1, %ymm29, %ymm3
-; AVX512-NEXT:    vpternlogq $184, %ymm3, %ymm28, %ymm30
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm7
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpermt2d %zmm11, %zmm12, %zmm1
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm24[2,3,3,3,6,7,7,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm12
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm26
+; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm15, %zmm26
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm12
+; AVX512-NEXT:    vprold $16, %xmm12, %xmm15
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[1,1,2,3]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7]
+; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm1
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm24 = mem[2,2,2,3]
+; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm23 = mem[0,2,2,3]
+; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm7 = mem[0,2,2,3]
+; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm17 = mem[2,1,3,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,7,6]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm29, %zmm4
+; AVX512-NEXT:    vpbroadcastd 36(%rax), %ymm0
+; AVX512-NEXT:    vpbroadcastd 40(%rax), %ymm6
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm28
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm28
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX512-NEXT:    vpshufb %xmm14, %xmm6, %xmm4
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm12[0],xmm4[1],xmm12[2,3],xmm4[4],xmm12[5,6],xmm4[7]
+; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm0[3,3,3,3]
+; AVX512-NEXT:    vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm0 = mem[1,2,2,3,5,6,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[2,1,2,3,6,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm14[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm1[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm1[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512-NEXT:    vmovdqa64 %xmm19, %xmm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
+; AVX512-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = mem[2,1,3,2]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm5 = mem[2,2,2,3]
+; AVX512-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm6 = mem[2,2,3,3]
+; AVX512-NEXT:    vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[2,2,2,3]
+; AVX512-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[0,2,2,3]
+; AVX512-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
+; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm27 = mem[2,1,3,3]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm20 = mem[2,2,2,3]
+; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm11 = mem[0,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm30[2,1,3,2]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm1 = mem[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm22[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm21[0,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,1,3,2]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm18[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm23
+; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm7, %zmm24
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %zmm23, %zmm30, %zmm24
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %ymm24, %ymm7, %ymm0
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-NEXT:    vpternlogq $184, %ymm0, %ymm18, %ymm14
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm2 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm10, %zmm23, %zmm8
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm17, %zmm8
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm23, %zmm10
+; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm17, %zmm10
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm23, %zmm28, %zmm18
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm18
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm23
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm23
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm21
+; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21
+; AVX512-NEXT:    vextracti64x4 $1, %zmm24, %ymm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm3, %ymm29
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 64-byte Folded Reload
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm19, %ymm0
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT:    vpternlogq $184, %ymm0, %ymm1, %ymm2
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm0[0,1,2,3],zmm6[0,1,2,3]
+; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm2
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm14[0,1,2,3],zmm2[4,5,6,7]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm0, %zmm29
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 64-byte Folded Reload
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 64-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm20, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm19, %zmm3
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm4, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm25, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm16, %zmm5
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm4, %zmm5
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $184, %zmm3, %zmm2, %zmm20
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $184, %zmm5, %zmm2, %zmm25
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
-; AVX512-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm30
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm11
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %zmm14, %zmm1, %zmm25
-; AVX512-NEXT:    vpternlogq $184, %zmm5, %zmm1, %zmm20
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm5, %zmm2
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm3, %zmm22
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm5, %zmm22
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512-NEXT:    vpermd 64(%rax), %zmm14, %zmm5
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm24, %zmm26, %zmm1
-; AVX512-NEXT:    vpermd (%rax), %zmm14, %zmm14
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm14
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogq $184, %zmm2, %zmm1, %zmm5
-; AVX512-NEXT:    vpternlogq $184, %zmm22, %zmm1, %zmm14
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm31, %zmm3
+; AVX512-NEXT:    vpternlogq $184, %zmm2, %zmm30, %zmm3
+; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm2 = mem[0,2,2,3]
+; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm4 = mem[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm4 = mem[2,1,3,3]
+; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm5 = mem[0,0,1,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm5 = mem[0,0,2,1]
+; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm6 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm19 = mem[0,0,1,1]
+; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm9 = mem[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm11 = mem[0,2,2,3]
+; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm12 = mem[0,1,3,2,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
+; AVX512-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm13 = mem[2,1,3,3]
+; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm14 = mem[0,0,1,1]
+; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm15 = mem[0,0,2,1]
+; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm0 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm16 = mem[0,0,1,1]
+; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = mem[0,2,3,3,4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm30, %zmm4
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm12[0,1,1,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm11, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm11
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm30, %zmm11
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512-NEXT:    vpternlogq $184, %zmm4, %zmm2, %zmm12
+; AVX512-NEXT:    vpternlogq $184, %zmm11, %zmm2, %zmm26
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm19, %zmm4
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm23, %zmm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm16, %zmm1
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $184, %zmm1, %zmm4, %zmm2
-; AVX512-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm1 = mem[0,1,1,3]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm22
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm8
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm8
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm7 = mem[2,2,2,3]
-; AVX512-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm9 = mem[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
-; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm10 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm16 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm12 = mem[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm21 = mem[0,0,1,1]
-; AVX512-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm15 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm24 = mem[2,2,2,3]
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
-; AVX512-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm26 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm27 = mem[0,0,2,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
-; AVX512-NEXT:    vpbroadcastd 96(%rax), %ymm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm9
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
-; AVX512-NEXT:    vpbroadcastd 32(%rax), %ymm10
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm7
-; AVX512-NEXT:    vpternlogd $184, %zmm3, %zmm29, %zmm7
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512-NEXT:    vpternlogq $184, %zmm22, %zmm3, %zmm9
-; AVX512-NEXT:    vpternlogq $184, %zmm8, %zmm3, %zmm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm16, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm21, %zmm8
-; AVX512-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm27, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm3
-; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm28, %zmm3
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512-NEXT:    vpternlogq $184, %zmm8, %zmm1, %zmm31
-; AVX512-NEXT:    vpternlogq $184, %zmm3, %zmm1, %zmm13
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11
+; AVX512-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm2
+; AVX512-NEXT:    vpternlogq $184, %zmm1, %zmm0, %zmm28
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm0, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm14, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm23, 64(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm31, 704(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm9, 640(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 576(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm18, 512(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm11, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm19, 768(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm30, 832(%rax)
-; AVX512-NEXT:    addq $2808, %rsp # imm = 0xAF8
+; AVX512-NEXT:    vmovdqa64 %zmm24, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm28, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm26, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm25, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm10, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm17, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm18, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 704(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm12, 640(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 512(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm7, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm22, 768(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm29, 832(%rax)
+; AVX512-NEXT:    addq $2168, %rsp # imm = 0x878
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    subq $1496, %rsp # imm = 0x5D8
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm2
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm4
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm10
+; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm16
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm2
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm28
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
+; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm17
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vporq %ymm2, %ymm4, %ymm18
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm31
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm8
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
-; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm12
+; AVX512-FCP-NEXT:    vpor %ymm8, %ymm12, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
-; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm15
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
-; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm13
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm13, %ymm19
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm13
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm12
-; AVX512-FCP-NEXT:    vpor %ymm14, %ymm12, %ymm12
-; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm13
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm14
-; AVX512-FCP-NEXT:    vpor %ymm12, %ymm14, %ymm12
-; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm14, %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm10
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
-; AVX512-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm9
-; AVX512-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm8, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm11
-; AVX512-FCP-NEXT:    vporq %ymm11, %ymm10, %ymm22
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm13, %ymm30
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm5[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm11
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm12
-; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm7
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
-; AVX512-FCP-NEXT:    vpternlogq $248, %ymm13, %ymm12, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm6
-; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $248, %ymm16, %ymm11, %ymm6
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm11
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm16, %ymm12
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
-; AVX512-FCP-NEXT:    vprold $16, %ymm10, %ymm11
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm12, %ymm10, %ymm11
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm13
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm11
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm14
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm14, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX512-FCP-NEXT:    vporq %ymm5, %ymm0, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm3[2],ymm11[3,4],ymm3[5],ymm11[6,7,8,9],ymm3[10],ymm11[11,12],ymm3[13],ymm11[14,15]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm17, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm11
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17,u,u,u,u],zero,zero
+; AVX512-FCP-NEXT:    vpternlogq $248, %ymm25, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm5
+; AVX512-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $248, %ymm12, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,0,0,0,5,0,0]
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm0, %ymm17, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm22
+; AVX512-FCP-NEXT:    vprold $16, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm6
-; AVX512-FCP-NEXT:    vpermd %ymm6, %ymm11, %ymm11
-; AVX512-FCP-NEXT:    vpandn %ymm11, %ymm13, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm18
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm12
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm11
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,2,2,3,10,9,11,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
-; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm11
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm12, %ymm16
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm11, %zmm20, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm27, %zmm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm0
+; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpandnq %ymm1, %ymm25, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm5 = [151522058,0,421010202,421010202]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm7 = [218894094,0,488382238,488382238]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm27
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm7
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm16
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7,8,9],ymm9[10],ymm7[11,12],ymm9[13],ymm7[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm1, %zmm7
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm24
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6]
-; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 64(%rax), %zmm12
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,30,0,0,0,31,0,0,31]
-; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm3
+; AVX512-FCP-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm3
-; AVX512-FCP-NEXT:    vpandn %ymm3, %ymm6, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm0
+; AVX512-FCP-NEXT:    vpandnq %ymm0, %ymm23, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm25
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm5
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm30
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm11
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,1,1,12,13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $248, %zmm23, %zmm3, %zmm11
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
+; AVX512-FCP-NEXT:    vpandnq %ymm0, %ymm23, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm1
+; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $248, %zmm23, %zmm3, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT:    vpermi2q %zmm0, %zmm21, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm3
-; AVX512-FCP-NEXT:    vpandn %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm9
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm11
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm5
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm5
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm27, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vprold $16, %ymm9, %ymm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm11, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm16
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm16
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm24, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm14, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
+; AVX512-FCP-NEXT:    vprold $16, %ymm14, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm13[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [2,1,3,2,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm16, %zmm3
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm19, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
-; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm19
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm17[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm4
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm11, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm5, %zmm27, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm12
-; AVX512-FCP-NEXT:    vprold $16, %ymm26, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm22
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm30
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm28[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm5
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm18, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm19, %zmm2
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm22, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm28[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512-FCP-NEXT:    vprold $16, %ymm8, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm3
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm14
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm12, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm0
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm2
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, %xmm14
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm29
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, %xmm7
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [0,2,2,3,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm28[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm16, %zmm0
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm8, %ymm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, %xmm8
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%rsp) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm6
 ; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vprold $16, %xmm6, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm3, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm6
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-FCP-NEXT:    vprold $16, %xmm6, %xmm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %xmm9, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm5, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm6
 ; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,9,9,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm2
-; AVX512-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm6
-; AVX512-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm21, %zmm31
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm16, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm6
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5,6],xmm6[7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm21
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm11, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vprold $16, %xmm11, %xmm6
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm27, %zmm9
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm8, %zmm5, %zmm9
+; AVX512-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm5
+; AVX512-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm23
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm5, %zmm2, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm5
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm29
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm13
-; AVX512-FCP-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vprold $16, %xmm13, %xmm5
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm3, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm7, %xmm17
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm26, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm10, %xmm22
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, %xmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm1
 ; AVX512-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
 ; AVX512-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm26
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm21, %zmm26
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm8, %zmm2, %zmm26
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vmovdqa %ymm6, %ymm9
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm20
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm4, %zmm20
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm9, %zmm2, %zmm20
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
 ; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
 ; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm18
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm11, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm11
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm7
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm27, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT:    vprold $16, %ymm4, %ymm2
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm19[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm23, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm21, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm16, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm16, %zmm22
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,8,8,9]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm2
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm14, %ymm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8,9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm15
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm14, %xmm14
-; AVX512-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [6,7,3,3,7,7,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm25, %ymm17, %ymm16
-; AVX512-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm20
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm16, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm18, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm10, %zmm27, %zmm25
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm21, %zmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm21, %zmm3
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm28, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm7
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512-FCP-NEXT:    vmovdqa %ymm7, %ymm15
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm8
+; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-FCP-NEXT:    vprold $16, %ymm5, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm0, %ymm18, %ymm16
-; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm13
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm30, %zmm8
-; AVX512-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm5
-; AVX512-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm19
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm5, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm6, %zmm24, %zmm5
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm14
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm12, %xmm30
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
-; AVX512-FCP-NEXT:    vpermd %ymm23, %ymm17, %ymm17
-; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm19
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm17, %zmm20
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm8, %zmm18, %zmm20
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm27, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm7, %zmm1, %zmm5
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm9, %xmm8
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm21, %zmm8
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm18
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm11
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm1
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
+; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm5, %zmm19
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm25, %zmm19
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm12
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15]
+; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm31, %zmm15
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm31 = [6,7,3,3,7,7,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm30, %ymm31, %ymm1
+; AVX512-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm21
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm13, %zmm30, %zmm21
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm26, %xmm1
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm24[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm13
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [0,0,2,1,8,8,9,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm0
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6],xmm7[7]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm0, %zmm7
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm8, %zmm28, %zmm7
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm24 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm13
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm30, %ymm7
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm11
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm13
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm28, %zmm3
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm26 = ymm2[3,3,3,3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm29 = ymm2[2,2,2,2]
+; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1
+; AVX512-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm2
+; AVX512-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm6
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm22, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm16[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm15
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm16, %zmm10
+; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm31, %ymm9
+; AVX512-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm18
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm9, %zmm9
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm10, %zmm30, %zmm9
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3],xmm0[4],xmm8[5,6],xmm0[7]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm25, %zmm0
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm4
+; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm4
 ; AVX512-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm3
-; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm24, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm7, %zmm1, %zmm3
+; AVX512-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm22, %zmm3
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm0, %zmm1, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm7, %ymm0, %ymm29
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm13, %ymm1, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vpternlogq $184, %ymm16, %ymm2, %ymm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm11
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm0, %ymm26
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm1
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm29[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT:    # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm10 = mem[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm8 = mem[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm8, %ymm8
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT:    vpandn %ymm8, %ymm11, %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
+; AVX512-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpandn %ymm10, %ymm12, %ymm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm10, %zmm25, %zmm1
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm28, %zmm4
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm28, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm7
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
-; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm25, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 704(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 640(%rax)
-; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovaps %zmm1, 576(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 512(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 832(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 768(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 448(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 704(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 640(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 576(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 512(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 832(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 768(%rax)
 ; AVX512-FCP-NEXT:    addq $1496, %rsp # imm = 0x5D8
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride7_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $2808, %rsp # imm = 0xAF8
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm6
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm13
+; AVX512DQ-NEXT:    subq $2168, %rsp # imm = 0x878
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm9
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm7
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm8
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm3
-; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm17
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm8, %ymm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
-; AVX512DQ-NEXT:    vporq %ymm2, %ymm3, %ymm18
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm9, %ymm2
+; AVX512DQ-NEXT:    vporq %ymm1, %ymm2, %ymm16
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm7, %ymm2
+; AVX512DQ-NEXT:    vporq %ymm1, %ymm2, %ymm17
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
 ; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa %ymm3, %ymm10
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm21
 ; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm27
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm6
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm6, %ymm2
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm22
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa %ymm10, %ymm3
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
 ; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm2
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm5, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm28
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm22
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm1, %ymm11
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm11, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm9
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm0, %ymm10
-; AVX512DQ-NEXT:    vpor %ymm10, %ymm9, %ymm9
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm6, %ymm9
-; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm31
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm0, %ymm12
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm11, %ymm13
+; AVX512DQ-NEXT:    vpor %ymm12, %ymm13, %ymm10
+; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm13
+; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm13, %ymm12
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm15
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm15, %ymm14
+; AVX512DQ-NEXT:    vpor %ymm14, %ymm12, %ymm10
+; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vprold $16, %ymm15, %ymm12
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm13[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm9
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15]
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-NEXT:    vpermi2q %zmm10, %zmm12, %zmm14
+; AVX512DQ-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm14
+; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm14, %ymm10
+; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm19
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
 ; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512DQ-NEXT:    vprold $16, %ymm9, %ymm8
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm9, %ymm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm17, %zmm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
-; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm16, %zmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm17, %zmm7
+; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm10
 ; AVX512DQ-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $248, %ymm11, %ymm7, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $248, %ymm11, %ymm6, %ymm9
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm6
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17,u,u,u,u],zero,zero
+; AVX512DQ-NEXT:    vpternlogq $248, %ymm26, %ymm7, %ymm8
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm12
+; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm12, %ymm9
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $248, %ymm14, %ymm8, %ymm9
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm6, %ymm10, %ymm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm7, %ymm8, %ymm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm7, %ymm10, %ymm8
+; AVX512DQ-NEXT:    vprold $16, %ymm12, %ymm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm8, %ymm14, %ymm7
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,2,3],zmm7[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm6
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm18, %zmm7
+; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm7
+; AVX512DQ-NEXT:    vpermd %zmm7, %zmm18, %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm7[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm7, %ymm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
+; AVX512DQ-NEXT:    vpandnq %ymm9, %ymm26, %ymm9
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm6, %ymm6
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX512DQ-NEXT:    vpandn %ymm7, %ymm12, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %zmm12, %zmm19
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-NEXT:    vpbroadcastd 72(%rax), %ymm6
-; AVX512DQ-NEXT:    vpandnq %ymm6, %ymm28, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vpbroadcastd 72(%rax), %ymm7
+; AVX512DQ-NEXT:    vpandnq %ymm7, %ymm16, %ymm9
 ; AVX512DQ-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512DQ-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm7, %ymm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm7
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm17
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm20
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm15
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm9
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm14
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm10
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm8
-; AVX512DQ-NEXT:    vpandnq %ymm8, %ymm28, %ymm8
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm12
-; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm12, %ymm13
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm6
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm12
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm29
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm24
-; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm8, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm15, %xmm25
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm12
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm21
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm16
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm7, %ymm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpbroadcastd 8(%rax), %ymm9
+; AVX512DQ-NEXT:    vpandnq %ymm9, %ymm16, %ymm9
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm8
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm8, %ymm12
+; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm24
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7,8,9],ymm9[10],ymm12[11,12],ymm9[13],ymm12[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm11[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm9
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm9, %ymm10
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm9[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512DQ-NEXT:    vpandnq %ymm12, %ymm19, %ymm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm4, %ymm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vprold $16, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpandnq %ymm12, %ymm26, %ymm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512DQ-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm10 = ymm13[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm13
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm2, %ymm10
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermd %zmm15, %zmm18, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm9, %zmm18, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm12
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[1,1,1,1,5,5,5,5]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm2
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm31, %ymm11
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm30
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vprold $16, %ymm5, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm8
+; AVX512DQ-NEXT:    vprold $16, %ymm21, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm20[1,2,2,3,5,6,6,7]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm20[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512DQ-NEXT:    vpermd 64(%rax), %zmm11, %zmm0
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm26, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm12
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512DQ-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm20[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19]
 ; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm1
 ; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm18, %zmm0
-; AVX512DQ-NEXT:    vpbroadcastd 100(%rax), %ymm1
-; AVX512DQ-NEXT:    vpbroadcastd 104(%rax), %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm31
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm6
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm12, %zmm3
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm7[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
+; AVX512DQ-NEXT:    vpbroadcastd 96(%rax), %ymm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm3, %zmm5, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm4
+; AVX512DQ-NEXT:    vprold $16, %xmm3, %xmm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm23
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm4
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm3, %xmm5
+; AVX512DQ-NEXT:    vmovdqa %xmm6, %xmm7
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512DQ-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm29, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastd 100(%rax), %ymm2
+; AVX512DQ-NEXT:    vpbroadcastd 104(%rax), %ymm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm14, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm8
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512DQ-NEXT:    vprold $16, %xmm10, %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm0, %zmm14
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm17
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm4, %zmm1
-; AVX512DQ-NEXT:    vpbroadcastd 64(%rax), %ymm3
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm6, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastd 64(%rax), %ymm2
 ; AVX512DQ-NEXT:    vpbroadcastd 68(%rax), %ymm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm25
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm19, %zmm25
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm8, %xmm3
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqa %xmm8, %xmm14
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3],xmm1[4],xmm5[5,6],xmm1[7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm7[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm19
+; AVX512DQ-NEXT:    vprold $16, %xmm5, %xmm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm2
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX512DQ-NEXT:    vprold $16, %xmm21, %xmm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm0, %zmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm4, %zmm0
-; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm1
-; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm19, %zmm20
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm10
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm6, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastd (%rax), %ymm3
+; AVX512DQ-NEXT:    vpbroadcastd 4(%rax), %ymm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm9, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm7
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm22[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm22
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm9
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm21
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vprold $16, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm0
-; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm9
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vmovdqa %ymm3, %ymm11
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512DQ-NEXT:    vprold $16, %ymm7, %ymm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm8
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm9
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm10
-; AVX512DQ-NEXT:    vprold $16, %xmm10, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8,9,10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15]
+; AVX512DQ-NEXT:    vpermt2q %zmm1, %zmm10, %zmm4
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm18
+; AVX512DQ-NEXT:    vpermd (%rax), %zmm11, %zmm1
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm4, %zmm26, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6,7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14,15]
 ; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm27
-; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm1
-; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm2
-; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm13
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm7
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm4, %zmm1
-; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm3 = mem[2,1,3,2]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm1, %ymm29, %ymm3
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm3, %ymm28, %ymm30
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8,9,10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm12, %zmm1
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm11 = ymm24[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm12
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm11, %zmm26
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm15, %zmm26
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm1
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm12
+; AVX512DQ-NEXT:    vprold $16, %xmm12, %xmm15
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[1,1,2,3]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7]
+; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm24 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm23 = mem[0,2,2,3]
+; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm7 = mem[0,2,2,3]
+; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm17 = mem[2,1,3,3]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm29, %zmm4
+; AVX512DQ-NEXT:    vpbroadcastd 36(%rax), %ymm0
+; AVX512DQ-NEXT:    vpbroadcastd 40(%rax), %ymm6
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm28
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm28
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm6, %xmm4
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
+; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm12[0],xmm4[1],xmm12[2,3],xmm4[4],xmm12[5,6],xmm4[7]
+; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm0[3,3,3,3]
+; AVX512DQ-NEXT:    vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm0 = mem[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm14 = ymm1[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm14[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm1[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm1[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm19, %xmm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm5 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm6 = mem[2,2,3,3]
+; AVX512DQ-NEXT:    vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[0,2,2,3]
+; AVX512DQ-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3]
+; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm27 = mem[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm20 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm11 = mem[0,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm19 = ymm30[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm1 = mem[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm25 = ymm22[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm21[0,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm16 = ymm16[2,1,3,2]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm18[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm23
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm17, %zmm7, %zmm24
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm23, %zmm30, %zmm24
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm24, %ymm7, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm0, %ymm18, %ymm14
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm10, %zmm23, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm17, %zmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm23, %zmm10
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm17, %zmm10
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm23, %zmm28, %zmm18
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm18
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm23
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm23
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm21
+; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm24, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm3, %ymm29
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 64-byte Folded Reload
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm19, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $184, %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm0[0,1,2,3],zmm6[0,1,2,3]
+; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm14[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm0, %zmm29
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm20, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm19, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm4, %zmm3
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm25, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm4, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm2, %zmm20
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm5, %zmm2, %zmm25
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
-; AVX512DQ-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm30
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm11
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm14, %zmm1, %zmm25
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm5, %zmm1, %zmm20
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm5, %zmm2
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm22, %zmm3, %zmm22
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm5, %zmm22
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512DQ-NEXT:    vpermd 64(%rax), %zmm14, %zmm5
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm5
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm24, %zmm26, %zmm1
-; AVX512DQ-NEXT:    vpermd (%rax), %zmm14, %zmm14
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm3, %zmm14
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm2, %zmm1, %zmm5
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm22, %zmm1, %zmm14
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm27, %zmm31, %zmm3
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm2, %zmm30, %zmm3
+; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm2 = mem[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm4 = mem[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm4 = mem[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm5 = mem[0,0,1,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm5 = mem[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm6 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm19 = mem[0,0,1,1]
+; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm9 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
+; AVX512DQ-NEXT:    vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm11 = mem[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm12 = mem[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm13 = mem[2,1,3,3]
+; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm14 = mem[0,0,1,1]
+; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm15 = mem[0,0,2,1]
+; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm0 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm16 = mem[0,0,1,1]
+; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm1 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm30, %zmm4
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm12[0,1,1,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm11, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm11
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm30, %zmm11
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm4, %zmm2, %zmm12
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm11, %zmm2, %zmm26
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm19, %zmm4
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm23, %zmm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm16, %zmm1
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm23, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm1, %zmm4, %zmm2
-; AVX512DQ-NEXT:    vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm1 = mem[0,1,1,3]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm22
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm21, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm8
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm4, %zmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
-; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm7 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm9 = mem[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm10 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm16 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm12 = mem[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm21 = mem[0,0,1,1]
-; AVX512DQ-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm15 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm24 = mem[2,2,2,3]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
-; AVX512DQ-NEXT:    vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm26 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT:    vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm27 = mem[0,0,2,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
-; AVX512DQ-NEXT:    vpbroadcastd 96(%rax), %ymm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm9
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
-; AVX512DQ-NEXT:    vpbroadcastd 32(%rax), %ymm10
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm7
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm3, %zmm29, %zmm7
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm22, %zmm3, %zmm9
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm8, %zmm3, %zmm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm16, %zmm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm21, %zmm8
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm27, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm1, %zmm28, %zmm3
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm8, %zmm1, %zmm31
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm1, %zmm13
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm4, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm1, %zmm0, %zmm28
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 64(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 704(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 640(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 576(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 512(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 768(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 832(%rax)
-; AVX512DQ-NEXT:    addq $2808, %rsp # imm = 0xAF8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 704(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 640(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 576(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 512(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 768(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 832(%rax)
+; AVX512DQ-NEXT:    addq $2168, %rsp # imm = 0x878
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    subq $1496, %rsp # imm = 0x5D8
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm28
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm17
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm4, %ymm18
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm31
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm8
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm15, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm13
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm13, %ymm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm13
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT:    vpor %ymm14, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm13
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT:    vpor %ymm12, %ymm14, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm14, %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm9
-; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm9
-; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm8, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT:    vporq %ymm11, %ymm10, %ymm22
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm13, %ymm30
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm5[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm11
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm7, %zmm12
-; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, %ymm13, %ymm12, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm10, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, %ymm16, %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm12, %ymm11
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm16, %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm10, %ymm11
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm12, %ymm10, %ymm11
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm13
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm14
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm14, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX512DQ-FCP-NEXT:    vporq %ymm5, %ymm0, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm2
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm3[2],ymm11[3,4],ymm3[5],ymm11[6,7,8,9],ymm3[10],ymm11[11,12],ymm3[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm17, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17,u,u,u,u],zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, %ymm25, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, %ymm12, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,0,0,0,5,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm0, %ymm17, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm22
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT:    vpandn %ymm11, %ymm13, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm6, %ymm12
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm23
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,2,2,3,10,9,11,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm12, %ymm16
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm20, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm27, %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm0
+; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm1, %ymm25, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm20
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm5 = [151522058,0,421010202,421010202]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm7 = [218894094,0,488382238,488382238]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm27
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm16
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7,8,9],ymm9[10],ymm7[11,12],ymm9[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm24
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rax), %zmm12
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,30,0,0,0,31,0,0,31]
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm6, %zmm3
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm12, %zmm0, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpandn %ymm3, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 72(%rax), %ymm0
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm0, %ymm23, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm7, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm25
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm30
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,1,1,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm23, %zmm3, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm0
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm0, %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm23, %zmm3, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,0,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpermi2q %zmm0, %zmm21, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpandn %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpermi2q %zmm3, %zmm22, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm10, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm5
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm5
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm27, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm16
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm16
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm24, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm13[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm16, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm19
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm17[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm9
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm10, %ymm4
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm20, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm11, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm5, %zmm27, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm12
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm26, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm22
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm30
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm28[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm23, %zmm5
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm18, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm19, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm22, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm28[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm28[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm12, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, %xmm14
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, %xmm7
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [0,2,2,3,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm28[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm1
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, %xmm8
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%rsp) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm6
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm6, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm3, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm6
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm6, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm20, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,9,9,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm21, %zmm31
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm16, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm21
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm11, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm11, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm4[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm27, %zmm9
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm8, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 64(%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 68(%rax), %ymm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm23
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm5, %zmm2, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm29
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm13, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm3, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm7, %xmm17
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm26, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm10, %xmm22
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm16, %zmm1
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd (%rax), %ymm5
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd 4(%rax), %ymm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm26
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm21, %zmm26
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm8, %zmm2, %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm9
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm20
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm4, %zmm20
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm9, %zmm2, %zmm20
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
 ; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
 ; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm7[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm18
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm7
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm27, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm19[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm23, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm20, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm21, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm16, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm16, %zmm22
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm18[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,8,8,9]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm30, %zmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm14, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8,9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm14, %xmm14
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm20, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [6,7,3,3,7,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm25, %ymm17, %ymm16
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm20
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm16, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm18, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm10, %zmm27, %zmm25
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm21, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm21, %zmm3
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm28, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm28, %zmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, %ymm15
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm8
+; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm5, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm0, %ymm18, %ymm16
-; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm30, %zmm8
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm5
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm19
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm6, %zmm24, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm14
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm12, %xmm30
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm23, %ymm17, %ymm17
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm19
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm17, %zmm20
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm8, %zmm18, %zmm20
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm27, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm7, %zmm1, %zmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm21, %zmm8
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm18
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm19, %zmm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm5, %zmm19
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm25, %zmm19
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm31, %zmm15
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm31 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm30, %ymm31, %ymm1
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 96(%rax), %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm13, %zmm30, %zmm21
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm26, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm24[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [0,0,2,1,8,8,9,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm8, %zmm28, %zmm7
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm24 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm30, %ymm7
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm13
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm10, %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm28, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm26 = ymm2[3,3,3,3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm29 = ymm2[2,2,2,2]
+; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 100(%rax), %ymm2
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 104(%rax), %ymm6
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm22, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm16[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm16, %zmm10
+; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm31, %ymm9
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 32(%rax), %ymm18
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm10, %zmm30, %zmm9
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm17, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3],xmm0[4],xmm8[5,6],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm4
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm28, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpbroadcastd 36(%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm24, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm7, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastd 40(%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm4, %zmm22, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm0, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm7, %ymm0, %ymm29
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm13, %ymm1, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm16, %ymm2, %ymm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm0, %ymm26
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm29[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
 ; AVX512DQ-FCP-NEXT:    # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm10 = mem[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT:    vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT:    vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm8 = mem[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpandn %ymm8, %ymm11, %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
+; AVX512DQ-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # xmm10 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpandn %ymm10, %ymm12, %ymm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT:    vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm11 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm10, %zmm25, %zmm1
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm28, %zmm4
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm28, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm4, %zmm25, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 704(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 640(%rax)
-; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovaps %zmm1, 576(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 512(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 832(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 768(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 704(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 640(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 576(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 512(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 832(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 768(%rax)
 ; AVX512DQ-FCP-NEXT:    addq $1496, %rsp # imm = 0x5D8
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 06d390f053c7e..f56c43eb49df2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -4879,10 +4879,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
 ; AVX512-NEXT:    vpternlogq $226, %zmm5, %zmm8, %zmm7
 ; AVX512-NEXT:    vpternlogq $248, %zmm28, %zmm7, %zmm2
-; AVX512-NEXT:    vporq %zmm24, %zmm26, %zmm5
-; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
-; AVX512-NEXT:    vporq %zmm25, %zmm27, %zmm7
-; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm24[2,2,3,3,6,6,7,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm26[2,2,3,3,6,6,7,7]
+; AVX512-NEXT:    vporq %zmm5, %zmm7, %zmm5
+; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm25[2,2,3,3,6,6,7,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm9 = zmm27[2,2,3,3,6,6,7,7]
+; AVX512-NEXT:    vporq %zmm7, %zmm9, %zmm7
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
 ; AVX512-NEXT:    vpternlogq $226, %zmm5, %zmm9, %zmm7
 ; AVX512-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm30
@@ -4916,165 +4918,165 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
-; AVX512-FCP-NEXT:    vporq %ymm1, %ymm3, %ymm17
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm17
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm28
-; AVX512-FCP-NEXT:    vporq %xmm1, %xmm4, %xmm18
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm31
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm15[8,u],zero,xmm15[7],zero,xmm15[9,u,11,u],zero,xmm15[10],zero,xmm15[12,u],zero
+; AVX512-FCP-NEXT:    vporq %xmm2, %xmm4, %xmm18
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm4
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm4
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm11
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
 ; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm6
 ; AVX512-FCP-NEXT:    vporq %ymm4, %ymm6, %ymm19
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm6[6],zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9],zero,xmm6[11,u],zero,xmm6[10],zero,xmm6[12]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm30
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm8
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm29
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm29
 ; AVX512-FCP-NEXT:    vporq %xmm4, %xmm8, %xmm20
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm4
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm31
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
 ; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm21
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm8
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm22
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm21
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
+; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm22
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
 ; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm8
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
-; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm15
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm24
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm8
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm24
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
 ; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm11
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm11, %zmm26
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm10
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm12
-; AVX512-FCP-NEXT:    vporq %ymm10, %ymm12, %ymm23
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm10
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm25
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm10
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm23
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm25
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm10
 ; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm9, %ymm27
+; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm1
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm27
 ; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
+; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm1
+; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm16
 ; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11]
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
-; AVX512-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm28
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm14[8,u],zero,xmm14[7],zero,xmm14[9,u,11,u],zero,xmm14[10],zero,xmm14[12,u],zero
+; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm3
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm13[6],zero,xmm13[8,u],zero,xmm13[7],zero,xmm13[9],zero,xmm13[11,u],zero,xmm13[10],zero,xmm13[12]
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm15
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
-; AVX512-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm29
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
+; AVX512-FCP-NEXT:    vpor %xmm1, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
+; AVX512-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm11
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[26],zero,ymm12[28],zero,zero,ymm12[27],zero,ymm12[29],zero,ymm12[31],zero,zero,ymm12[30],zero
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
 ; AVX512-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm2
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2]
-; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512-FCP-NEXT:    vpandn %ymm10, %ymm11, %ymm10
+; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm10
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX512-FCP-NEXT:    vpandn %ymm10, %ymm12, %ymm10
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
-; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm12, %ymm31
-; AVX512-FCP-NEXT:    vpandnq %ymm31, %ymm30, %ymm31
-; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm8, %zmm8
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
+; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm28, %ymm30
+; AVX512-FCP-NEXT:    vpandnq %ymm30, %ymm29, %ymm30
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm4
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm4, %zmm4
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
 ; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm28, %zmm9, %zmm9
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm29, %zmm13, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm28
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm28, %zmm10
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm9, %zmm3
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm10
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [2,2,3,3,8,8,9,9]
-; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512-FCP-NEXT:    vporq %zmm21, %zmm22, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7]
-; AVX512-FCP-NEXT:    vporq %zmm24, %zmm26, %zmm5
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm6, %zmm5
-; AVX512-FCP-NEXT:    vpermt2d %zmm28, %zmm12, %zmm4
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm30, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 256(%r9)
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm20[0,0,1,1]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm4, %zmm4
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm5, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $248, %zmm11, %zmm4, %zmm2
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm23, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm16[2,2,3,3]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm27, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm5, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm1
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm8
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm9[0,0,1,1,4,4,5,5]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm13[0,0,1,1,4,4,5,5]
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
-; AVX512-FCP-NEXT:    vpermd %zmm10, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
-; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm3, %zmm3
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 128(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm6
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm1
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm21[2,2,3,3,6,6,7,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm22[2,2,3,3,6,6,7,7]
+; AVX512-FCP-NEXT:    vporq %zmm6, %zmm7, %zmm6
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm24[2,2,3,3,6,6,7,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm26[2,2,3,3,6,6,7,7]
+; AVX512-FCP-NEXT:    vporq %zmm7, %zmm9, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm9, %zmm7
+; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm8
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 256(%r9)
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm18[0,0,1,1]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm6, %zmm6
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm20[0,0,1,1]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm7, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm8, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq $248, %zmm12, %zmm7, %zmm2
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm25[2,2,3,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm23, %zmm6
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm16[2,2,3,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm27, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm8, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm9, %zmm1
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5]
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
+; AVX512-FCP-NEXT:    vpermd %zmm10, %zmm3, %zmm3
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r9)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 192(%r9)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -5222,10 +5224,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm5, %zmm8, %zmm7
 ; AVX512DQ-NEXT:    vpternlogq $248, %zmm28, %zmm7, %zmm2
-; AVX512DQ-NEXT:    vporq %zmm24, %zmm26, %zmm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
-; AVX512DQ-NEXT:    vporq %zmm25, %zmm27, %zmm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm24[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm26[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT:    vporq %zmm5, %zmm7, %zmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm25[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm9 = zmm27[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT:    vporq %zmm7, %zmm9, %zmm7
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm5, %zmm9, %zmm7
 ; AVX512DQ-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm30
@@ -5259,165 +5263,165 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm3, %ymm17
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm17
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm28
-; AVX512DQ-FCP-NEXT:    vporq %xmm1, %xmm4, %xmm18
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm31
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm15[8,u],zero,xmm15[7],zero,xmm15[9,u,11,u],zero,xmm15[10],zero,xmm15[12,u],zero
+; AVX512DQ-FCP-NEXT:    vporq %xmm2, %xmm4, %xmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm6
 ; AVX512DQ-FCP-NEXT:    vporq %ymm4, %ymm6, %ymm19
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm6[6],zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9],zero,xmm6[11,u],zero,xmm6[10],zero,xmm6[12]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm13, %xmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm29
 ; AVX512DQ-FCP-NEXT:    vporq %xmm4, %xmm8, %xmm20
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm31
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm21
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm22
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm21
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm28
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
+; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm22
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm24
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm8
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm24
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
 ; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm11, %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm10
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm2, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm12
-; AVX512DQ-FCP-NEXT:    vporq %ymm10, %ymm12, %ymm23
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm25
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm23
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm25
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm10
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm9, %ymm27
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm1
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm27
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm1
+; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm16
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
-; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm28
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm14[8,u],zero,xmm14[7],zero,xmm14[9,u,11,u],zero,xmm14[10],zero,xmm14[12,u],zero
+; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm13
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm13[6],zero,xmm13[8,u],zero,xmm13[7],zero,xmm13[9],zero,xmm13[11,u],zero,xmm13[10],zero,xmm13[12]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
-; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm29
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm13, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
+; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[26],zero,ymm12[28],zero,zero,ymm12[27],zero,ymm12[29],zero,ymm12[31],zero,zero,ymm12[30],zero
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm8
 ; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm8, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512DQ-FCP-NEXT:    vpandn %ymm10, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX512DQ-FCP-NEXT:    vpandn %ymm10, %ymm12, %ymm10
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm12, %ymm31
-; AVX512DQ-FCP-NEXT:    vpandnq %ymm31, %ymm30, %ymm31
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm28, %ymm30
+; AVX512DQ-FCP-NEXT:    vpandnq %ymm30, %ymm29, %ymm30
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm4, %zmm4
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm28, %zmm9, %zmm9
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm29, %zmm13, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm28
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm28, %zmm10
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm9, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm5, %zmm10
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [2,2,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512DQ-FCP-NEXT:    vporq %zmm21, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7]
-; AVX512DQ-FCP-NEXT:    vporq %zmm24, %zmm26, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermt2d %zmm28, %zmm12, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm5, %zmm30, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 256(%r9)
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm20[0,0,1,1]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm11, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm23, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm16[2,2,3,3]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm27, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm8
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm9[0,0,1,1,4,4,5,5]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm13[0,0,1,1,4,4,5,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm10, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm28, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 128(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm31, %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm21[2,2,3,3,6,6,7,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm22[2,2,3,3,6,6,7,7]
+; AVX512DQ-FCP-NEXT:    vporq %zmm6, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm24[2,2,3,3,6,6,7,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm26[2,2,3,3,6,6,7,7]
+; AVX512DQ-FCP-NEXT:    vporq %zmm7, %zmm9, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm9, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm8
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm7, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 256(%r9)
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm18[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm20[0,0,1,1]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm8, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, %zmm12, %zmm7, %zmm2
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm25[2,2,3,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm23, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm16[2,2,3,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm27, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm6, %zmm8, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm9, %zmm1
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm10, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r9)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%r9)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 60af864597f4f..e43aa56c96c28 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -5405,289 +5405,304 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512-LABEL: store_i8_stride6_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $456, %rsp # imm = 0x1C8
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm6
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm7
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm4
-; AVX512-NEXT:    vmovdqa (%r9), %xmm12
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm8
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
-; AVX512-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
+; AVX512-NEXT:    subq $328, %rsp # imm = 0x148
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm7
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm8
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm9
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
+; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm30
-; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm31
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm27
 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm29
-; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX512-NEXT:    vmovdqa %xmm7, %xmm3
+; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm2
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm7, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm26
+; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm7, %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm8, %xmm5
-; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm27
-; AVX512-NEXT:    vinserti32x4 $2, %xmm4, %zmm5, %zmm4
-; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512-NEXT:    vpshufb %xmm9, %xmm6, %xmm4
-; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm5
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa64 %xmm7, %xmm18
-; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm20
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm22
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm0
-; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
-; AVX512-NEXT:    vmovdqa64 %xmm12, %xmm21
+; AVX512-NEXT:    vpshufb %xmm4, %xmm5, %xmm2
+; AVX512-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
+; AVX512-NEXT:    vmovdqa64 %xmm10, %xmm29
+; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm30
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
+; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm11, %xmm31
 ; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512-NEXT:    vpshufb %ymm15, %ymm4, %ymm0
-; AVX512-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm6, %ymm0
+; AVX512-NEXT:    vpshufb %ymm7, %ymm12, %ymm1
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31]
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
-; AVX512-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11],ymm12[12],ymm6[12],ymm12[13],ymm6[13],ymm12[14],ymm6[14],ymm12[15],ymm6[15],ymm12[24],ymm6[24],ymm12[25],ymm6[25],ymm12[26],ymm6[26],ymm12[27],ymm6[27],ymm12[28],ymm6[28],ymm12[29],ymm6[29],ymm12[30],ymm6[30],ymm12[31],ymm6[31]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX512-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512-NEXT:    vpshufb %ymm12, %ymm13, %ymm0
-; AVX512-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15],ymm8[24],ymm13[24],ymm8[25],ymm13[25],ymm8[26],ymm13[26],ymm8[27],ymm13[27],ymm8[28],ymm13[28],ymm8[29],ymm13[29],ymm8[30],ymm13[30],ymm8[31],ymm13[31]
-; AVX512-NEXT:    vmovdqa64 %ymm13, %ymm17
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
-; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
-; AVX512-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
-; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm13, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm14
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
-; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm14, %ymm0
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512-NEXT:    vpshufb %ymm14, %ymm8, %ymm0
+; AVX512-NEXT:    vpshufb %ymm14, %ymm9, %ymm1
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm4
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
 ; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm28
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-NEXT:    vpshufb %ymm15, %ymm1, %ymm10
-; AVX512-NEXT:    vpshufb %ymm15, %ymm0, %ymm15
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm15 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512-NEXT:    vpshufb %ymm6, %ymm15, %ymm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm26
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm10
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm15
-; AVX512-NEXT:    vpshufb %ymm12, %ymm10, %ymm6
-; AVX512-NEXT:    vpshufb %ymm12, %ymm15, %ymm9
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[16],ymm6[16],ymm9[17],ymm6[17],ymm9[18],ymm6[18],ymm9[19],ymm6[19],ymm9[20],ymm6[20],ymm9[21],ymm6[21],ymm9[22],ymm6[22],ymm9[23],ymm6[23]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15],ymm15[24],ymm10[24],ymm15[25],ymm10[25],ymm15[26],ymm10[26],ymm15[27],ymm10[27],ymm15[28],ymm10[28],ymm15[29],ymm10[29],ymm15[30],ymm10[30],ymm15[31],ymm10[31]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm6, %zmm24
-; AVX512-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
-; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm23
-; AVX512-NEXT:    vmovdqa (%r9), %ymm4
-; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm25
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23]
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm3
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm4, %ymm10
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm11
+; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm3
+; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm15
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX512-NEXT:    vpshufb %ymm7, %ymm15, %ymm11
+; AVX512-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[4],ymm11[4],ymm7[5],ymm11[5],ymm7[6],ymm11[6],ymm7[7],ymm11[7],ymm7[16],ymm11[16],ymm7[17],ymm11[17],ymm7[18],ymm11[18],ymm7[19],ymm11[19],ymm7[20],ymm11[20],ymm7[21],ymm11[21],ymm7[22],ymm11[22],ymm7[23],ymm11[23]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11],ymm10[12],ymm15[12],ymm10[13],ymm15[13],ymm10[14],ymm15[14],ymm10[15],ymm15[15],ymm10[24],ymm15[24],ymm10[25],ymm15[25],ymm10[26],ymm15[26],ymm10[27],ymm15[27],ymm10[28],ymm15[28],ymm10[29],ymm15[29],ymm10[30],ymm15[30],ymm10[31],ymm15[31]
+; AVX512-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
+; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm7, %zmm25
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm7
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512-NEXT:    vpshufb %ymm14, %ymm7, %ymm13
+; AVX512-NEXT:    vpshufb %ymm14, %ymm11, %ymm5
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[16],ymm13[16],ymm5[17],ymm13[17],ymm5[18],ymm13[18],ymm5[19],ymm13[19],ymm5[20],ymm13[20],ymm5[21],ymm13[21],ymm5[22],ymm13[22],ymm5[23],ymm13[23]
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm13 = ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11],ymm11[12],ymm7[12],ymm11[13],ymm7[13],ymm11[14],ymm7[14],ymm11[15],ymm7[15],ymm11[24],ymm7[24],ymm11[25],ymm7[25],ymm11[26],ymm7[26],ymm11[27],ymm7[27],ymm11[28],ymm7[28],ymm11[29],ymm7[29],ymm11[30],ymm7[30],ymm11[31],ymm7[31]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm24
+; AVX512-NEXT:    vmovdqa (%r9), %ymm3
+; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm23
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[4],ymm7[4],ymm11[5],ymm7[5],ymm11[6],ymm7[6],ymm11[7],ymm7[7],ymm11[16],ymm7[16],ymm11[17],ymm7[17],ymm11[18],ymm7[18],ymm11[19],ymm7[19],ymm11[20],ymm7[20],ymm11[21],ymm7[21],ymm11[22],ymm7[22],ymm11[23],ymm7[23]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm28
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm1
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm19
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
 ; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm15
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm10
 ; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm1
 ; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[16],ymm3[16],ymm8[17],ymm3[17],ymm8[18],ymm3[18],ymm8[19],ymm3[19],ymm8[20],ymm3[20],ymm8[21],ymm3[21],ymm8[22],ymm3[22],ymm8[23],ymm3[23]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm7
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm5, %xmm9
-; AVX512-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[4],ymm3[4],ymm11[5],ymm3[5],ymm11[6],ymm3[6],ymm11[7],ymm3[7],ymm11[16],ymm3[16],ymm11[17],ymm3[17],ymm11[18],ymm3[18],ymm11[19],ymm3[19],ymm11[20],ymm3[20],ymm11[21],ymm3[21],ymm11[22],ymm3[22],ymm11[23],ymm3[23]
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm19
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm5
-; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm5
-; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-NEXT:    vpshufb %xmm15, %xmm5, %xmm5
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm30
+; AVX512-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm8
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm11
+; AVX512-NEXT:    vpshufb %xmm9, %xmm8, %xmm5
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm21
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23]
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm20
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm5
+; AVX512-NEXT:    vpshufb %xmm6, %xmm5, %xmm13
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
+; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm31
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm8, %ymm3, %ymm9
+; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm3
 ; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm6, %ymm5
-; AVX512-NEXT:    vpshufb %ymm11, %ymm13, %ymm6
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
-; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm13
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
-; AVX512-NEXT:    vpshufb %ymm3, %ymm14, %ymm3
-; AVX512-NEXT:    vpshufb %xmm12, %xmm1, %xmm14
-; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm8
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; AVX512-NEXT:    vpshufb %xmm14, %xmm1, %xmm3
+; AVX512-NEXT:    vpshufb %xmm14, %xmm0, %xmm4
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512-NEXT:    vprold $16, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512-NEXT:    vpshufb %xmm12, %xmm15, %xmm1
-; AVX512-NEXT:    vpshufb %xmm12, %xmm2, %xmm8
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm27
+; AVX512-NEXT:    vpshufb %xmm14, %xmm2, %xmm0
+; AVX512-NEXT:    vpshufb %xmm14, %xmm10, %xmm1
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm3
 ; AVX512-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm30
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm10[0,0,0,1]
-; AVX512-NEXT:    vprold $16, %ymm16, %ymm8
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
+; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm6
+; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm15
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm10
+; AVX512-NEXT:    vmovdqa64 %ymm10, %ymm17
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm10
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm14
+; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm0
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm1, %ymm5
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm3
+; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm12
+; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm3
+; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm4
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm18[0,0,0,1]
+; AVX512-NEXT:    vprold $16, %ymm28, %ymm1
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm8
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm13[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm20[0,0,0,1]
-; AVX512-NEXT:    vprold $16, %ymm17, %ymm14
-; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm31[0,0,0,1]
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm6[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[0,0,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm11, %zmm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm1
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512-NEXT:    vpternlogq $184, %ymm0, %ymm6, %ymm5
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512-NEXT:    vpternlogq $184, %ymm1, %ymm0, %ymm28
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm1
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm28[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm13, %zmm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX512-NEXT:    vpternlogd $184, %zmm5, %zmm8, %zmm1
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm29[0,0,0,1]
+; AVX512-NEXT:    vprold $16, %ymm30, %ymm9
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm21[0,0,0,1]
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm13
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm28 = ymm31[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm22[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm17[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm31 = ymm16[0,0,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm8
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm5, %zmm5
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm11, %zmm9
+; AVX512-NEXT:    vpternlogq $226, %zmm5, %zmm6, %zmm9
+; AVX512-NEXT:    vextracti64x4 $1, %zmm9, %ymm5
+; AVX512-NEXT:    vpternlogq $184, %ymm5, %ymm6, %ymm2
+; AVX512-NEXT:    vpternlogq $184, %ymm9, %ymm0, %ymm15
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm15[0,1,2,3],zmm2[4,5,6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm5
+; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm8, %zmm5
+; AVX512-NEXT:    vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
 ; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm9, %zmm8
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm2
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT:    vpternlogd $184, %zmm8, %zmm5, %zmm2
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm14[2,2,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm15, %zmm1
-; AVX512-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm31, %zmm6
-; AVX512-NEXT:    vpternlogd $184, %zmm1, %zmm5, %zmm6
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm12, %zmm1
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm2
-; AVX512-NEXT:    vpternlogd $184, %zmm6, %zmm4, %zmm2
-; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm3 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm4, %zmm3
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpternlogq $184, %zmm3, %zmm9, %zmm0
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm3 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm3
-; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm6 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm4, %zmm6
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpternlogq $184, %zmm6, %zmm9, %zmm0
-; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm6 = mem[0,0,0,1,4,4,4,5]
-; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm6
-; AVX512-NEXT:    vpermq $234, (%rsp), %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm5 = mem[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm5
-; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm26[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm24[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm7
-; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpternlogq $184, %zmm5, %zmm4, %zmm0
-; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm23[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpternlogq $184, %zmm7, %zmm4, %zmm5
-; AVX512-NEXT:    vpermq {{.*#+}} zmm4 = zmm28[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm4
-; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm25[2,2,2,3,6,6,6,7]
-; AVX512-NEXT:    vpternlogd $184, %zmm5, %zmm7, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm8, %ymm2
+; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm9, %ymm30
+; AVX512-NEXT:    vpternlogq $184, %ymm8, %ymm6, %ymm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm2
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[4,5,6,7]
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm10, %zmm8
+; AVX512-NEXT:    vpermq {{.*#+}} zmm2 = zmm26[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm11 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm9, %zmm11
+; AVX512-NEXT:    vextracti64x4 $1, %zmm11, %ymm2
+; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm9, %ymm14
+; AVX512-NEXT:    vpternlogq $184, %ymm11, %ymm6, %ymm31
+; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm2
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm2[4,5,6,7]
+; AVX512-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm11 = mem[0,0,0,1,4,4,4,5]
+; AVX512-NEXT:    vpternlogd $184, %zmm2, %zmm10, %zmm11
+; AVX512-NEXT:    vpermq $234, (%rsp), %zmm2 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm10 = mem[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm10
+; AVX512-NEXT:    vpermq {{.*#+}} zmm2 = zmm25[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm13 = zmm24[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm13
+; AVX512-NEXT:    vextracti64x4 $1, %zmm10, %ymm2
+; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm0, %ymm7
+; AVX512-NEXT:    vextracti64x4 $1, %zmm13, %ymm2
+; AVX512-NEXT:    vpternlogq $184, %ymm2, %ymm0, %ymm3
+; AVX512-NEXT:    vpternlogq $184, %ymm10, %ymm9, %ymm12
+; AVX512-NEXT:    vpternlogq $184, %ymm13, %ymm9, %ymm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm12[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm3 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm3
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm2, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm8, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm5, 256(%rax)
 ; AVX512-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512-NEXT:    addq $456, %rsp # imm = 0x1C8
+; AVX512-NEXT:    addq $328, %rsp # imm = 0x148
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i8_stride6_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $424, %rsp # imm = 0x1A8
+; AVX512-FCP-NEXT:    subq $392, %rsp # imm = 0x188
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
 ; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
 ; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
@@ -5697,526 +5712,561 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm5
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm3
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm3
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm22
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm25
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
 ; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm5
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm5
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
 ; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm6
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm31
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX512-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
-; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm6
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
-; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm8
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm17
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm8
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm5
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm11
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm1
 ; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm15[8],ymm11[9],ymm15[9],ymm11[10],ymm15[10],ymm11[11],ymm15[11],ymm11[12],ymm15[12],ymm11[13],ymm15[13],ymm11[14],ymm15[14],ymm11[15],ymm15[15],ymm11[24],ymm15[24],ymm11[25],ymm15[25],ymm11[26],ymm15[26],ymm11[27],ymm15[27],ymm11[28],ymm15[28],ymm11[29],ymm15[29],ymm11[30],ymm15[30],ymm11[31],ymm15[31]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15],ymm12[24],ymm11[24],ymm12[25],ymm11[25],ymm12[26],ymm11[26],ymm12[27],ymm11[27],ymm12[28],ymm11[28],ymm12[29],ymm11[29],ymm12[30],ymm11[30],ymm12[31],ymm11[31]
 ; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm2
-; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
-; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm12, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm9
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
+; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
 ; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm16
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm26
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm13
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm29
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm14
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm1
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm6
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm2, %zmm27
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm3, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm9
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT:    vprold $16, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm30
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm26
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm27
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm15
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm15, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm28
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm0[2,2,2,3]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,2,2,3]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm10
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; AVX512-FCP-NEXT:    vprold $16, %xmm10, %xmm10
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm10, %zmm31
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[4],ymm15[4],ymm11[5],ymm15[5],ymm11[6],ymm15[6],ymm11[7],ymm15[7],ymm11[16],ymm15[16],ymm11[17],ymm15[17],ymm11[18],ymm15[18],ymm11[19],ymm15[19],ymm11[20],ymm15[20],ymm11[21],ymm15[21],ymm11[22],ymm15[22],ymm11[23],ymm15[23]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm10, %zmm23
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm0
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm14
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; AVX512-FCP-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm14, %zmm29
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[16],ymm11[16],ymm12[17],ymm11[17],ymm12[18],ymm11[18],ymm12[19],ymm11[19],ymm12[20],ymm11[20],ymm12[21],ymm11[21],ymm12[22],ymm11[22],ymm12[23],ymm11[23]
 ; AVX512-FCP-NEXT:    vprold $16, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm9
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[4],ymm9[4],ymm2[5],ymm9[5],ymm2[6],ymm9[6],ymm2[7],ymm9[7],ymm2[16],ymm9[16],ymm2[17],ymm9[17],ymm2[18],ymm9[18],ymm2[19],ymm9[19],ymm2[20],ymm9[20],ymm2[21],ymm9[21],ymm2[22],ymm9[22],ymm2[23],ymm9[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm9
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm15
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm17
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,0,1,10,10,10,11]
-; AVX512-FCP-NEXT:    vpermt2q %zmm9, %zmm4, %zmm15
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm7
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm15
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX512-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm11
+; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,10,10,10,11]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm16, %zmm11
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm15, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm17
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm5
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX512-FCP-NEXT:    vprold $16, %ymm0, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm2
+; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm4 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm8
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm15
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm8
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm8
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm15, %xmm4
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm8
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
 ; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm15, %zmm10, %zmm1
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm0
-; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[4],ymm8[4],ymm1[5],ymm8[5],ymm1[6],ymm8[6],ymm1[7],ymm8[7],ymm1[16],ymm8[16],ymm1[17],ymm8[17],ymm1[18],ymm8[18],ymm1[19],ymm8[19],ymm1[20],ymm8[20],ymm1[21],ymm8[21],ymm1[22],ymm8[22],ymm1[23],ymm8[23]
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[2],mem[2],ymm5[3],mem[3],ymm5[4],mem[4],ymm5[5],mem[5],ymm5[6],mem[6],ymm5[7],mem[7],ymm5[16],mem[16],ymm5[17],mem[17],ymm5[18],mem[18],ymm5[19],mem[19],ymm5[20],mem[20],ymm5[21],mem[21],ymm5[22],mem[22],ymm5[23],mem[23]
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
-; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm6
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
-; AVX512-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm6
-; AVX512-FCP-NEXT:    vprold $16, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm9, %zmm6
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm3
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm6, %zmm10, %zmm3
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm5
-; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm5
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm2, %zmm5
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm12
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm9
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm5
+; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm15, %xmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm15
+; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm30 = ymm0[2,2,2,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm31 = ymm1[2,2,2,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm6[0,0,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[0,0,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm14[0,0,0,1]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm14
+; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm14, %ymm2
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm11, %ymm11
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm14, %ymm13
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm8
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm7, %zmm6
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm16, %ymm9
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm7, %zmm8, %zmm6
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm1 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm1
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm16, %ymm12
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm6
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm6[4,5,6,7]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm8, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm3 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm2, %ymm14, %ymm22
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm6 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm6
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm6, %ymm2
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm2, %ymm14, %ymm30
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm1 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm1
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm4 = mem[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm4
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm23[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm4, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm24[2,2,2,3,6,6,6,7]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm4
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm24
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm24[0,1,2,3],zmm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm7 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm8, %zmm7
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm6, %ymm2, %ymm31
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm3
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm31[0,1,2,3],zmm3[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq $234, (%rsp), %zmm6 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm6 = mem[2,2,2,3,6,6,6,7]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm8, %zmm6
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm23[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm8
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm29[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm27[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm9
+; AVX512-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm3
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm16, %ymm5
+; AVX512-FCP-NEXT:    vpternlogq $184, %ymm9, %ymm16, %ymm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm25[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm5
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm29[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm3, %zmm9, %zmm0
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm5, %zmm9, %zmm2
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm21[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm28[0,0,0,1,4,4,4,5]
-; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm5, %zmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm3
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm28[0,0,0,1,4,4,4,5]
+; AVX512-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm4, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512-FCP-NEXT:    addq $424, %rsp # imm = 0x1A8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, 320(%rax)
+; AVX512-FCP-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride6_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $456, %rsp # imm = 0x1C8
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm7
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm4
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm12
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm8
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm3, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
+; AVX512DQ-NEXT:    subq $328, %rsp # imm = 0x148
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm8
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm9
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm30
-; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm31
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm27
 ; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX512DQ-NEXT:    vmovdqa %xmm7, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm7, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm7, %zmm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm4
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm5
-; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm27
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm4, %zmm5, %zmm4
-; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm6, %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm7, %xmm5
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm18
-; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm20
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm4
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm22
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm12, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
-; AVX512DQ-NEXT:    vmovdqa64 %xmm12, %xmm21
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm5, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm10, %xmm29
+; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm30
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm11, %xmm31
 ; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm4, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm11, %ymm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm6, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm12, %ymm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
-; AVX512DQ-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11],ymm12[12],ymm6[12],ymm12[13],ymm6[13],ymm12[14],ymm6[14],ymm12[15],ymm6[15],ymm12[24],ymm6[24],ymm12[25],ymm6[25],ymm12[26],ymm6[26],ymm12[27],ymm6[27],ymm12[28],ymm6[28],ymm12[29],ymm6[29],ymm12[30],ymm6[30],ymm12[31],ymm6[31]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
+; AVX512DQ-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm8, %ymm1
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15],ymm8[24],ymm13[24],ymm8[25],ymm13[25],ymm8[26],ymm13[26],ymm8[27],ymm13[27],ymm8[28],ymm13[28],ymm8[29],ymm13[29],ymm8[30],ymm13[30],ymm8[31],ymm13[31]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm13, %ymm17
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
-; AVX512DQ-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
-; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm13, %ymm0
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
-; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm13, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm14
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
-; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm14, %ymm0
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm8, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm9, %ymm1
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
+; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm4
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
 ; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm28
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm1, %ymm10
-; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm0, %ymm15
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm15 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm15, %ymm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm10, %zmm26
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm10
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm15
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm10, %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm15, %ymm9
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[16],ymm6[16],ymm9[17],ymm6[17],ymm9[18],ymm6[18],ymm9[19],ymm6[19],ymm9[20],ymm6[20],ymm9[21],ymm6[21],ymm9[22],ymm6[22],ymm9[23],ymm6[23]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15],ymm15[24],ymm10[24],ymm15[25],ymm10[25],ymm15[26],ymm10[26],ymm15[27],ymm10[27],ymm15[28],ymm10[28],ymm15[29],ymm10[29],ymm15[30],ymm10[30],ymm15[31],ymm10[31]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm9, %ymm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm6, %zmm24
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm6, %ymm4
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm23
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm25
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm16
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm3
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm10
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
+; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm11
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm11, %zmm3
+; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm15
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm10
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm15, %ymm11
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm10, %ymm7
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[4],ymm11[4],ymm7[5],ymm11[5],ymm7[6],ymm11[6],ymm7[7],ymm11[7],ymm7[16],ymm11[16],ymm7[17],ymm11[17],ymm7[18],ymm11[18],ymm7[19],ymm11[19],ymm7[20],ymm11[20],ymm7[21],ymm11[21],ymm7[22],ymm11[22],ymm7[23],ymm11[23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11],ymm10[12],ymm15[12],ymm10[13],ymm15[13],ymm10[14],ymm15[14],ymm10[15],ymm15[15],ymm10[24],ymm15[24],ymm10[25],ymm15[25],ymm10[26],ymm15[26],ymm10[27],ymm15[27],ymm10[28],ymm15[28],ymm10[29],ymm15[29],ymm10[30],ymm15[30],ymm10[31],ymm15[31]
+; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm11, %ymm11
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm7, %zmm25
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm7
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm7, %ymm13
+; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm11, %ymm5
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[16],ymm13[16],ymm5[17],ymm13[17],ymm5[18],ymm13[18],ymm5[19],ymm13[19],ymm5[20],ymm13[20],ymm5[21],ymm13[21],ymm5[22],ymm13[22],ymm5[23],ymm13[23]
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm13 = ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11],ymm11[12],ymm7[12],ymm11[13],ymm7[13],ymm11[14],ymm7[14],ymm11[15],ymm7[15],ymm11[24],ymm7[24],ymm11[25],ymm7[25],ymm11[26],ymm7[26],ymm11[27],ymm7[27],ymm11[28],ymm7[28],ymm11[29],ymm7[29],ymm11[30],ymm7[30],ymm11[31],ymm7[31]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm24
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm23
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[4],ymm7[4],ymm11[5],ymm7[5],ymm11[6],ymm7[6],ymm11[7],ymm7[7],ymm11[16],ymm7[16],ymm11[17],ymm7[17],ymm11[18],ymm7[18],ymm11[19],ymm7[19],ymm11[20],ymm7[20],ymm11[21],ymm7[21],ymm11[22],ymm7[22],ymm11[23],ymm7[23]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm28
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm1
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm19
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm5, %xmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm15
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm10
 ; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm1
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[16],ymm3[16],ymm8[17],ymm3[17],ymm8[18],ymm3[18],ymm8[19],ymm3[19],ymm8[20],ymm3[20],ymm8[21],ymm3[21],ymm8[22],ymm3[22],ymm8[23],ymm3[23]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm17
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm7
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm5, %xmm9
-; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[4],ymm3[4],ymm11[5],ymm3[5],ymm11[6],ymm3[6],ymm11[7],ymm3[7],ymm11[16],ymm3[16],ymm11[17],ymm3[17],ymm11[18],ymm3[18],ymm11[19],ymm3[19],ymm11[20],ymm3[20],ymm11[21],ymm3[21],ymm11[22],ymm3[22],ymm11[23],ymm3[23]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm5
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-NEXT:    vpshufb %xmm15, %xmm5, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm30
+; AVX512DQ-NEXT:    vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm8
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm7, %xmm11
+; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm8, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm21
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm20
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %ymm5, %ymm31
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512DQ-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm3, %ymm9
+; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm4, %ymm3
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512DQ-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm6, %ymm5
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm13, %ymm6
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm13
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm14, %ymm3
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm1, %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm8
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm1, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm0, %xmm4
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX512DQ-NEXT:    vprold $16, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm15, %xmm1
-; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm2, %xmm8
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm27
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm2, %xmm0
+; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm10, %xmm1
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm3
 ; AVX512DQ-NEXT:    vprold $16, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm30
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm10[0,0,0,1]
-; AVX512DQ-NEXT:    vprold $16, %ymm16, %ymm8
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm1, %xmm6
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm15
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm10
+; AVX512DQ-NEXT:    vmovdqa64 %ymm10, %ymm17
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm10
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm14
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm5
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm12
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm1, %ymm4
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm18[0,0,0,1]
+; AVX512DQ-NEXT:    vprold $16, %ymm28, %ymm1
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm8
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm13[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm20[0,0,0,1]
-; AVX512DQ-NEXT:    vprold $16, %ymm17, %ymm14
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm31[0,0,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm6[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm22[0,0,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm11, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm6, %zmm1
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm0, %ymm6, %ymm5
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm1, %ymm0, %ymm28
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm1
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm28[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm13, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm5, %zmm8, %zmm1
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm29[0,0,0,1]
+; AVX512DQ-NEXT:    vprold $16, %ymm30, %ymm9
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm21[0,0,0,1]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm13
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm28 = ymm31[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm29 = ymm22[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm17[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm16[0,0,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm8
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm11, %zmm9
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm5, %zmm6, %zmm9
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm9, %ymm5
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm5, %ymm6, %ymm2
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm9, %ymm0, %ymm15
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm15[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm29, %zmm28, %zmm5
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm8, %zmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
 ; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm9, %zmm8
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm8, %zmm5, %zmm2
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm14[2,2,2,3]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm15, %zmm1
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm31, %zmm6
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm1, %zmm5, %zmm6
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm12, %zmm1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm11, %zmm2
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm6, %zmm4, %zmm2
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm3 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm4, %zmm3
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm3, %zmm9, %zmm0
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm3 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm3
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm6 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm4, %zmm6
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm6, %zmm9, %zmm0
-; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm6 = mem[0,0,0,1,4,4,4,5]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm6
-; AVX512DQ-NEXT:    vpermq $234, (%rsp), %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm5 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm26[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm24[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm7
-; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm5, %zmm4, %zmm0
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm23[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm7, %zmm4, %zmm5
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm4 = zmm28[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm7, %zmm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm25[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT:    vpternlogd $184, %zmm5, %zmm7, %zmm0
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm8, %ymm2
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm9, %ymm30
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm8, %ymm6, %ymm10
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm10, %zmm8
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm2 = zmm26[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm11 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm9, %zmm11
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm11, %ymm2
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm9, %ymm14
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm11, %ymm6, %ymm31
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQ-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm11 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm2, %zmm10, %zmm11
+; AVX512DQ-NEXT:    vpermq $234, (%rsp), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm10 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm10
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm2 = zmm25[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm13 = zmm24[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm6, %zmm13
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm10, %ymm2
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm13, %ymm2
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm2, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm10, %ymm9, %ymm12
+; AVX512DQ-NEXT:    vpternlogq $184, %ymm13, %ymm9, %ymm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm12[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm3 = zmm23[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT:    vpternlogd $184, %zmm0, %zmm6, %zmm3
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 256(%rax)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-NEXT:    addq $456, %rsp # imm = 0x1C8
+; AVX512DQ-NEXT:    addq $328, %rsp # imm = 0x148
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride6_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $424, %rsp # imm = 0x1A8
+; AVX512DQ-FCP-NEXT:    subq $392, %rsp # imm = 0x188
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
 ; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
@@ -6226,238 +6276,258 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm5
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm22
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm5, %ymm30
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31]
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm5
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm31
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15]
-; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0]
-; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm7, %ymm8
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm17
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm8
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm8, %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm11
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm15[8],ymm11[9],ymm15[9],ymm11[10],ymm15[10],ymm11[11],ymm15[11],ymm11[12],ymm15[12],ymm11[13],ymm15[13],ymm11[14],ymm15[14],ymm11[15],ymm15[15],ymm11[24],ymm15[24],ymm11[25],ymm15[25],ymm11[26],ymm15[26],ymm11[27],ymm15[27],ymm11[28],ymm15[28],ymm11[29],ymm15[29],ymm11[30],ymm15[30],ymm11[31],ymm15[31]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15],ymm12[24],ymm11[24],ymm12[25],ymm11[25],ymm12[26],ymm11[26],ymm12[27],ymm11[27],ymm12[28],ymm11[28],ymm12[29],ymm11[29],ymm12[30],ymm11[30],ymm12[31],ymm11[31]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm0
-; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm12, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm9
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
+; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
 ; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm26
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm13, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm29
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm2, %zmm27
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm3, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm5, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm3, %xmm9
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT:    vprold $16, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm19
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm3, %zmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm4, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm15
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm15, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm1
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0]
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm0[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm1
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm10
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; AVX512DQ-FCP-NEXT:    vprold $16, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm10, %zmm31
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[4],ymm15[4],ymm11[5],ymm15[5],ymm11[6],ymm15[6],ymm11[7],ymm15[7],ymm11[16],ymm15[16],ymm11[17],ymm15[17],ymm11[18],ymm15[18],ymm11[19],ymm15[19],ymm11[20],ymm15[20],ymm11[21],ymm15[21],ymm11[22],ymm15[22],ymm11[23],ymm15[23]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm14
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vprold $16, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm14, %zmm29
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[16],ymm11[16],ymm12[17],ymm11[17],ymm12[18],ymm11[18],ymm12[19],ymm11[19],ymm12[20],ymm11[20],ymm12[21],ymm11[21],ymm12[22],ymm11[22],ymm12[23],ymm11[23]
 ; AVX512DQ-FCP-NEXT:    vprold $16, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm9
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[4],ymm9[4],ymm2[5],ymm9[5],ymm2[6],ymm9[6],ymm2[7],ymm9[7],ymm2[16],ymm9[16],ymm2[17],ymm9[17],ymm2[18],ymm9[18],ymm2[19],ymm9[19],ymm2[20],ymm9[20],ymm2[21],ymm9[21],ymm2[22],ymm9[22],ymm2[23],ymm9[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm17
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,0,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm4, %zmm15
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm9, %zmm15
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27]
+; AVX512DQ-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm11
+; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm16, %zmm11
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm15, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm17
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX512DQ-FCP-NEXT:    vprold $16, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm4 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm18, %xmm8
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm15
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm7, %zmm8
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm15, %xmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm8
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
 ; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm15, %zmm10, %zmm1
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm0
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm1, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[4],ymm8[4],ymm1[5],ymm8[5],ymm1[6],ymm8[6],ymm1[7],ymm8[7],ymm1[16],ymm8[16],ymm1[17],ymm8[17],ymm1[18],ymm8[18],ymm1[19],ymm8[19],ymm1[20],ymm8[20],ymm1[21],ymm8[21],ymm1[22],ymm8[22],ymm1[23],ymm8[23]
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu (%rsp), %ymm5 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[2],mem[2],ymm5[3],mem[3],ymm5[4],mem[4],ymm5[5],mem[5],ymm5[6],mem[6],ymm5[7],mem[7],ymm5[16],mem[16],ymm5[17],mem[17],ymm5[18],mem[18],ymm5[19],mem[19],ymm5[20],mem[20],ymm5[21],mem[21],ymm5[22],mem[22],ymm5[23],mem[23]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm6
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm4, %zmm6
-; AVX512DQ-FCP-NEXT:    vprold $16, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm9, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm6, %zmm10, %zmm3
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm5
-; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm13, %ymm9
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm14, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm5
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm15, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm15
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm30 = ymm0[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm31 = ymm1[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm6[0,0,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[0,0,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm14[0,0,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm14
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm11, %ymm11
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm14, %ymm13
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm8
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-FCP-NEXT:    vpermt2q %zmm15, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm16, %ymm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm7, %zmm8, %zmm6
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm11, %ymm16, %ymm12
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm8, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm3 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm2, %ymm14, %ymm22
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm2 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm6 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm2, %zmm16, %zmm6
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm6, %ymm2
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm2, %ymm14, %ymm30
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm1, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm1 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm0 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm4 = mem[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm9, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm23[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm4, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm24[2,2,2,3,6,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm3, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm24
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm24[0,1,2,3],zmm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm8, %zmm7
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm6, %ymm2, %ymm31
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm31[0,1,2,3],zmm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $234, (%rsp), %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm6 = mem[2,2,2,3,6,6,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm3, %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm23[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm8
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm8, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm29[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm27[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm3, %zmm2, %zmm9
+; AVX512DQ-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm3
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm3, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm8, %ymm16, %ymm5
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %ymm9, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm25[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm0, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm29[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm3, %zmm9, %zmm0
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm5, %zmm9, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm21[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm5, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm28[0,0,0,1,4,4,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm5, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm2, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm28[0,0,0,1,4,4,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogd $184, %zmm0, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512DQ-FCP-NEXT:    addq $424, %rsp # imm = 0x1A8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 320(%rax)
+; AVX512DQ-FCP-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 416fbe9aa340c..8091afbbfd70c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -4254,7 +4254,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
 ; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
 ; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm14
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u],zero,zero,zero,zero,ymm14[14],zero,ymm14[u],zero,zero,zero,zero,ymm14[15],zero,ymm14[u],zero,zero,zero,zero,ymm14[16],zero,ymm14[u],zero,zero,zero,zero,ymm14[17],zero,ymm14[u],zero,zero
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u,u,u,u,u,14],zero,ymm14[u,u,u,u,u,15],zero,ymm14[u,u,u,u,u,16],zero,ymm14[u,u,u,u,u,17],zero,ymm14[u,u,u]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512-NEXT:    vpor %ymm0, %ymm9, %ymm0
 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
@@ -4357,7 +4357,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm8, %zmm9
 ; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u],zero,zero,zero,zero,ymm1[14],zero,ymm1[u],zero,zero,zero,zero,ymm1[15],zero,ymm1[u],zero,zero,zero,zero,ymm1[16],zero,ymm1[u],zero,zero,zero,zero,ymm1[17],zero,ymm1[u],zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
 ; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
@@ -4533,7 +4533,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
 ; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
 ; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm14
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u],zero,zero,zero,zero,ymm14[14],zero,ymm14[u],zero,zero,zero,zero,ymm14[15],zero,ymm14[u],zero,zero,zero,zero,ymm14[16],zero,ymm14[u],zero,zero,zero,zero,ymm14[17],zero,ymm14[u],zero,zero
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u,u,u,u,u,14],zero,ymm14[u,u,u,u,u,15],zero,ymm14[u,u,u,u,u,16],zero,ymm14[u,u,u,u,u,17],zero,ymm14[u,u,u]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm9, %ymm0
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
@@ -4636,7 +4636,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm8, %zmm9
 ; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u],zero,zero,zero,zero,ymm1[14],zero,ymm1[u],zero,zero,zero,zero,ymm1[15],zero,ymm1[u],zero,zero,zero,zero,ymm1[16],zero,ymm1[u],zero,zero,zero,zero,ymm1[17],zero,ymm1[u],zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
@@ -8544,35 +8544,39 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512-LABEL: store_i8_stride7_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $1384, %rsp # imm = 0x568
+; AVX512-NEXT:    subq $1720, %rsp # imm = 0x6B8
 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm7, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm26
 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm14
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm28
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm8
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r8), %ymm14
+; AVX512-NEXT:    vmovdqa (%r8), %ymm15
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm27
-; AVX512-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %ymm8
+; AVX512-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512-NEXT:    vmovdqa (%r9), %ymm2
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
 ; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
-; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm10
@@ -8585,7 +8589,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm9, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm16
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -8600,7 +8604,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
 ; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm20
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -8608,7 +8612,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
 ; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
 ; AVX512-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
@@ -8627,23 +8631,23 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
 ; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm22
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm7
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm17
+; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm13
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm7
 ; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm12
-; AVX512-NEXT:    vpshufb %ymm1, %ymm12, %ymm6
-; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm15, %ymm7
-; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm19
+; AVX512-NEXT:    vpshufb %ymm1, %ymm8, %ymm6
+; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm14, %ymm7
+; AVX512-NEXT:    vmovdqa64 %ymm14, %ymm22
 ; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm14, %ymm6
-; AVX512-NEXT:    vpshufb %ymm11, %ymm8, %ymm7
-; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm15, %ymm6
+; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm16
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm15
+; AVX512-NEXT:    vpshufb %ymm11, %ymm15, %ymm7
 ; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
@@ -8652,348 +8656,379 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512-NEXT:    vpshufb %xmm12, %xmm0, %xmm7
 ; AVX512-NEXT:    vpor %xmm6, %xmm7, %xmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm15
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm14
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512-NEXT:    vpshufb %xmm6, %xmm15, %xmm7
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm0, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm30
+; AVX512-NEXT:    vpshufb %xmm6, %xmm14, %xmm7
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm18
 ; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
 ; AVX512-NEXT:    vmovdqa 32(%rax), %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-NEXT:    vpermi2d %zmm7, %zmm8, %zmm28
+; AVX512-NEXT:    vpermi2d %zmm7, %zmm8, %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm24
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm18
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm1
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm7
+; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm20
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm29
-; AVX512-NEXT:    vporq %xmm7, %xmm8, %xmm31
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm30
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm29
+; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm0
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm0
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm7
-; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm0
 ; AVX512-NEXT:    vpshufb %ymm0, %ymm5, %ymm8
 ; AVX512-NEXT:    vpor %ymm7, %ymm8, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
 ; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
-; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm1
+; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm19
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
+; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
+; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm7, %ymm10, %ymm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
+; AVX512-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %ymm10, %ymm2, %ymm4
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
 ; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512-NEXT:    vpshufb %ymm8, %ymm9, %ymm4
-; AVX512-NEXT:    vpor %ymm1, %ymm4, %ymm1
-; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm4
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm5
+; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm5
 ; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
 ; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm4
 ; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm4
+; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm4
 ; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm31
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm3
+; AVX512-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm28
 ; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
-; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm17
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
-; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
-; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%r9), %xmm11
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
-; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
+; AVX512-NEXT:    vmovdqa %xmm3, %xmm6
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512-NEXT:    vpshufb %xmm12, %xmm4, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm18
 ; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm4
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25]
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm3
-; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
-; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm29
-; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512-NEXT:    vmovdqa (%r9), %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm2
+; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vmovdqa %xmm3, %xmm8
+; AVX512-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
+; AVX512-NEXT:    vmovdqa %xmm3, %xmm9
+; AVX512-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero,ymm13[25]
+; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512-NEXT:    vpshufb %ymm7, %ymm3, %ymm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
+; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm3
 ; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27]
-; AVX512-NEXT:    vpshufb %ymm7, %ymm12, %ymm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm23
-; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm2
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
-; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27]
+; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm30
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero
+; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm3
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31]
+; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpshufb %ymm10, %ymm15, %ymm1
+; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-NEXT:    vpermi2d %zmm1, %zmm2, %zmm16
-; AVX512-NEXT:    vmovdqa (%rax), %xmm5
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,6]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm15
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
+; AVX512-NEXT:    vmovdqa64 %xmm9, %xmm21
+; AVX512-NEXT:    vmovdqa64 %xmm8, %xmm22
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512-NEXT:    vmovdqa (%rax), %xmm11
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,5,6]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm19
-; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
+; AVX512-NEXT:    vpshufb %ymm9, %ymm4, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm16
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm27
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm0[2,3,2,3],zmm2[0,1,0,1]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm26
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm3[0,1,0,1]
-; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm2
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm0[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm20
+; AVX512-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm25
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm12[0,1,0,1]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
-; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm3
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[2,3,2,3],zmm13[0,1,0,1]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX512-NEXT:    vmovdqa %xmm6, %xmm8
+; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm7
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm12
+; AVX512-NEXT:    vpshufb %xmm1, %xmm4, %xmm2
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm8
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
-; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm4
-; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm7
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15]
+; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm6
+; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm4
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm15, %xmm15
+; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
 ; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
-; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
-; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm31, %zmm31
-; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT:    vpshufb %ymm9, %ymm0, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-NEXT:    vpshufb %ymm9, %ymm10, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2]
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm9
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512-NEXT:    vpternlogq $226, %zmm6, %zmm1, %zmm22
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm6 = mem[2,3,2,3]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512-NEXT:    # ymm27 = mem[2,3,2,3]
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm27
-; AVX512-NEXT:    vpternlogq $226, %zmm6, %zmm1, %zmm27
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
-; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload
-; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
-; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
-; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
-; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,1,0,0,4,5,6,7]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512-NEXT:    vpternlogq $226, %zmm12, %zmm10, %zmm17
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm2 = mem[2,3,2,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512-NEXT:    # ymm28 = mem[2,3,2,3]
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT:    vinserti64x4 $1, %ymm28, %zmm3, %zmm28
+; AVX512-NEXT:    vpternlogq $226, %zmm2, %zmm10, %zmm28
+; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[2,2,3,2]
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm10
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm2
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm27 # 16-byte Folded Reload
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm1
+; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
+; AVX512-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[1,1,0,0,4,5,6,7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm5
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
-; AVX512-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
-; AVX512-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512-NEXT:    vpternlogq $184, %zmm13, %zmm21, %zmm11
-; AVX512-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm17[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpternlogq $226, %zmm8, %zmm21, %zmm7
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm15[0,1,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm18[2,3,2,3]
-; AVX512-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm15 = mem[1,1,0,0,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
-; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload
-; AVX512-NEXT:    vporq %zmm26, %zmm23, %zmm17
-; AVX512-NEXT:    vpermq {{.*#+}} zmm18 = zmm20[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpternlogq $226, %zmm18, %zmm21, %zmm17
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12
-; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload
-; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm19
-; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm8 = mem[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm8
-; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm15[0,0,1,0]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm11
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm11
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm11
-; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm0
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm5, %zmm5
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm9 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm11 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vporq %zmm9, %zmm11, %zmm9
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm11 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm12 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vporq %zmm11, %zmm12, %zmm11
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm9 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm12 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vporq %zmm9, %zmm12, %zmm9
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512-NEXT:    vpternlogq $184, %zmm11, %zmm12, %zmm9
+; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm13[0,1,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm18[0,1,0,1]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm20[2,3,2,3]
+; AVX512-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm1 = mem[1,1,0,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm13 = zmm19[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpternlogq $226, %zmm13, %zmm12, %zmm0
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm21 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vporq %zmm13, %zmm21, %zmm13
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm20 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm18 = zmm30[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vporq %zmm20, %zmm18, %zmm18
+; AVX512-NEXT:    vpternlogq $226, %zmm13, %zmm12, %zmm18
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
+; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm4
+; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm16
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16
+; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm1
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm1
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
-; AVX512-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[2,3,2,3]
-; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm9
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm9
-; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm3 = zmm6[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
-; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm10[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,1,0,4,4,5,4]
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
-; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16
-; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm16
+; AVX512-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm14
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm10
+; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm27[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm5[0,0,1,0,4,4,5,4]
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
+; AVX512-NEXT:    # zmm2 = zmm2[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm15
+; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm15
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm16, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm9, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm11, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm8, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm19, 64(%rax)
-; AVX512-NEXT:    addq $1384, %rsp # imm = 0x568
+; AVX512-NEXT:    vmovdqa64 %zmm15, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm14, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm16, 64(%rax)
+; AVX512-NEXT:    addq $1720, %rsp # imm = 0x6B8
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512-FCP:       # %bb.0:
 ; AVX512-FCP-NEXT:    subq $1432, %rsp # imm = 0x598
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm25
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm1
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29]
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
-; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
+; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
 ; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
-; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm28
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm15[30],zero,ymm15[28,u,u,u],zero,ymm15[31],zero,ymm15[29,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
+; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
 ; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm21
+; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm22
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -9003,10 +9038,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm5
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero,zero,ymm5[18]
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm3
@@ -9016,351 +9051,366 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm14
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u]
-; AVX512-FCP-NEXT:    vpor %ymm4, %ymm7, %ymm4
-; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm0
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[13,u,u,u,u,u],zero,ymm0[14,u,u,u,u,u],zero,ymm0[15,u,u,u,u,u],zero,ymm0[16,u,u,u,u,u],zero,ymm0[17,u,u,u]
+; AVX512-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
 ; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
+; AVX512-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
 ; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm28
-; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
+; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm14
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
-; AVX512-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm10
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
-; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm10
-; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
-; AVX512-FCP-NEXT:    vpor %xmm4, %xmm8, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm11
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm13
-; AVX512-FCP-NEXT:    vporq %xmm11, %xmm13, %xmm30
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm11
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm14, %xmm30
+; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm9, %xmm26
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm9, %xmm21
+; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
+; AVX512-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
+; AVX512-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm12
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm13
+; AVX512-FCP-NEXT:    vmovdqa %xmm6, %xmm9
+; AVX512-FCP-NEXT:    vporq %xmm12, %xmm13, %xmm31
+; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm12
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm13
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm12
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm29
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm28, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm12
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm28
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm13
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
 ; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm11
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm11
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm25
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm11
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm11
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm13
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29]
-; AVX512-FCP-NEXT:    vporq %ymm6, %ymm5, %ymm20
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm15
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm15, %zmm27
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm13
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm15
+; AVX512-FCP-NEXT:    vpor %ymm13, %ymm15, %ymm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm15
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm24
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm14
+; AVX512-FCP-NEXT:    vpor %ymm15, %ymm14, %ymm6
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm4[28],zero,ymm4[30,31,30,31],zero,ymm4[29],zero,ymm4[31,28,29]
+; AVX512-FCP-NEXT:    vpor %ymm5, %ymm4, %ymm4
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
-; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm23
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero
-; AVX512-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm21
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27,u,u,u],zero,ymm0[30],zero,ymm0[28,u,u,u],zero,ymm0[31],zero
+; AVX512-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm22
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31]
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm1
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
 ; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm29
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm1
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm1
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm23
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm15
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm18
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
+; AVX512-FCP-NEXT:    vmovdqa %xmm9, %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm11, %xmm17
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
+; AVX512-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm0
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm19
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm1
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm12
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm12[13],zero,zero,zero,zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm20
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm5
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
-; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm17
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm6
-; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm31
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
-; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm18
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm3, %zmm28
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm26 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm30, %zmm7, %zmm7
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm20[2,3,2,3],zmm8[0,1,0,1]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm20 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm20
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm30 = zmm8[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512-FCP-NEXT:    vpternlogq $184, %zmm20, %zmm8, %zmm30
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm19[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm17[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm5, %zmm8, %zmm20
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm25 = zmm1[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm5[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm25, %zmm8, %zmm1
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm4
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm30, %xmm5
+; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
+; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm3[2,3,2,3],zmm5[0,1,0,1]
-; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
-; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm14, %zmm8, %zmm10
-; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[2,3,2,3],zmm5[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    # ymm11 = mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm29[2,3,2,3]
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm8, %zmm14
-; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm11, %ymm16
-; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm12
-; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm11, %ymm2
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,0,1],zmm1[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
+; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm26
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
+; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm2
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm31[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm14 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vporq %zmm14, %zmm31, %zmm14
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm30 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vporq %zmm31, %zmm30, %zmm30
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm30
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm14 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vporq %zmm14, %zmm31, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512-FCP-NEXT:    vpternlogq $184, %zmm30, %zmm31, %zmm14
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm18[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm31, %zmm13
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm7 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm29 = zmm29[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vporq %zmm7, %zmm29, %zmm7
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm27 = zmm27[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vporq %zmm28, %zmm27, %zmm27
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm31, %zmm27
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,2,3],zmm1[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm23[2,3,2,3],zmm7[0,1,0,1]
+; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
+; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm10
+; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm22[2,3,2,3],zmm10[0,1,0,1]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,1,0,1,2,0,0,1]
+; AVX512-FCP-NEXT:    vpermd %ymm11, %ymm17, %ymm28
+; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm6
+; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm17
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [4,5,4,5,5,7,4,5]
+; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm23, %ymm2
+; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
 ; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [4,5,4,5,5,7,4,5]
-; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
-; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm13[13],zero,zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero
-; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm24[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm27[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm22[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[0,0,1,0]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm13
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
-; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm18
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm5, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm28[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm5
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm3
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm7[0,1,0,1,4,5,4,5]
-; AVX512-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4
+; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm23, %ymm15
+; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm15
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm11, %zmm11
+; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm15, %zmm7
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm1 = mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    # ymm18 = mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm3, %zmm18
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm21[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm25[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm15, %zmm18
+; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm1 = mem[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm14
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm20
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm0
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm0
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm24[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5]
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm9
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm28, %zmm5
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm5
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5
+; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm6 = zmm3[2,3,2,3],mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm4, %zmm4
+; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm4
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm15, %zmm6
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm6
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm0
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm0
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8
-; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm8
+; AVX512-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload
+; AVX512-FCP-NEXT:    # zmm6 = zmm3[0,1,2,3],mem[2,3,2,3]
+; AVX512-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11
+; AVX512-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm11
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 384(%rax)
 ; AVX512-FCP-NEXT:    addq $1432, %rsp # imm = 0x598
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $1384, %rsp # imm = 0x568
+; AVX512DQ-NEXT:    subq $1720, %rsp # imm = 0x6B8
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm7
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm7, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm26
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm15
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm14
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm28
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm8
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm14
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm15
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm27
-; AVX512DQ-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm8
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm24
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm2
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
 ; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
-; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm17
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm19
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm10
@@ -9373,7 +9423,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm9, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm16
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -9388,7 +9438,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm20
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -9396,7 +9446,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
 ; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm2
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
@@ -9415,23 +9465,23 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm22
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm17
+; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm13
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm7
 ; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm12
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm12, %ymm6
-; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm15, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm19
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm8, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm7
+; AVX512DQ-NEXT:    vmovdqa64 %ymm14, %ymm22
 ; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm6
-; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm8, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm15, %ymm6
+; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm16
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm15
+; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm15, %ymm7
 ; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
@@ -9440,348 +9490,379 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm0, %xmm7
 ; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm15
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm14
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm15, %xmm7
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm0, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm30
+; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm14, %xmm7
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm18
 ; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
 ; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm0
 ; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm8, %zmm28
+; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm8, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
-; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm24
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm18
+; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm1
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm7
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm20
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm29
-; AVX512DQ-NEXT:    vporq %xmm7, %xmm8, %xmm31
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm30
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm29
+; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm0
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm28, %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm7
-; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm0
 ; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm8
 ; AVX512DQ-NEXT:    vpor %ymm7, %ymm8, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
 ; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
-; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm5, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm1
+; AVX512DQ-NEXT:    vmovdqa64 %ymm4, %ymm19
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
+; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
+; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512DQ-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm10, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm9, %ymm4
+; AVX512DQ-NEXT:    vpor %ymm0, %ymm4, %ymm0
+; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512DQ-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
 ; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm9, %ymm4
-; AVX512DQ-NEXT:    vpor %ymm1, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm4
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm5
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm5
 ; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm24, %ymm4
 ; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm4
+; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm4
 ; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm3
 ; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm31
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
-; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm3
+; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm28
 ; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
 ; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm17
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
-; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm11
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
-; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
+; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm6
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm4, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm18
 ; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm4
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
-; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm29
-; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
-; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm3
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm8
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
+; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
+; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm9
+; AVX512DQ-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero,ymm13[25]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm3
 ; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27]
-; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm12, %ymm2
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm23
-; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm2
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm30
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero
+; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm3
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31]
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpshufb %ymm10, %ymm15, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm2, %zmm16
-; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm5
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,6]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm4
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm15
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm9, %xmm21
+; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm22
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm11
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,5,6]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm3
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm19
-; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm16
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm27
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm0[2,3,2,3],zmm2[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm26
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm3[0,1,0,1]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm0[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm20
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm13, %xmm13
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm25
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm12[0,1,0,1]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm3
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[2,3,2,3],zmm13[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX512DQ-NEXT:    vmovdqa %xmm6, %xmm8
+; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm7
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm12
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm4, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm8
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm7
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm6
+; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm4
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm15
+; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm13
 ; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
-; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm31, %zmm31
-; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm10, %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2]
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm9
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm6, %zmm1, %zmm22
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm6 = mem[2,3,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    # ymm27 = mem[2,3,2,3]
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm27
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm6, %zmm1, %zmm27
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
-; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm0
-; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
-; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,1,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm12, %zmm10, %zmm17
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm2 = mem[2,3,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    # ymm28 = mem[2,3,2,3]
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm28, %zmm3, %zmm28
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm2, %zmm10, %zmm28
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm9[2,2,3,2]
+; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm10
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm2
+; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm27 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm1
+; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm8
+; AVX512DQ-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm11[1,1,0,0,4,5,6,7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm5
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-NEXT:    vpternlogq $184, %zmm13, %zmm21, %zmm11
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm17[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm8, %zmm21, %zmm7
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm15[0,1,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm18[2,3,2,3]
-; AVX512DQ-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm15 = mem[1,1,0,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
-; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vporq %zmm26, %zmm23, %zmm17
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm18 = zmm20[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpternlogq $226, %zmm18, %zmm21, %zmm17
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12
-; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload
-; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm19
-; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm8 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm8
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm15[0,0,1,0]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm11
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm11
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm11
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm0
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm9 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm11 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vporq %zmm9, %zmm11, %zmm9
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm11 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm12 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vporq %zmm11, %zmm12, %zmm11
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm9 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm12 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vporq %zmm9, %zmm12, %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-NEXT:    vpternlogq $184, %zmm11, %zmm12, %zmm9
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm13[0,1,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm18[0,1,0,1]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm20[2,3,2,3]
+; AVX512DQ-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT:    # xmm1 = mem[1,1,0,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm13 = zmm19[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm13, %zmm12, %zmm0
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm21 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vporq %zmm13, %zmm21, %zmm13
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm20 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm18 = zmm30[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vporq %zmm20, %zmm18, %zmm18
+; AVX512DQ-NEXT:    vpternlogq $226, %zmm13, %zmm12, %zmm18
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm4
+; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm16
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16
+; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm1
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm1
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm9
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm9
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm3 = zmm6[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm10[0,1,0,1,4,5,4,5]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,1,0,4,4,5,4]
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16
-; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm16
+; AVX512DQ-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm14
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm10
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm27[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5]
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm5[0,0,1,0,4,4,5,4]
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-NEXT:    # zmm2 = zmm2[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm15
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm15
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 64(%rax)
-; AVX512DQ-NEXT:    addq $1384, %rsp # imm = 0x568
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 64(%rax)
+; AVX512DQ-NEXT:    addq $1720, %rsp # imm = 0x6B8
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
 ; AVX512DQ-FCP-NEXT:    subq $1432, %rsp # imm = 0x598
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm1
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29]
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
-; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
+; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm28
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm15[30],zero,ymm15[28,u,u,u],zero,ymm15[31],zero,ymm15[29,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm19
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm17
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
 ; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm21
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm22
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -9791,10 +9872,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm5
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero,zero,ymm5[18]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm3
@@ -9804,316 +9885,327 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm14
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm7, %ymm4
-; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[13,u,u,u,u,u],zero,ymm0[14,u,u,u,u,u],zero,ymm0[15,u,u,u,u,u],zero,ymm0[16,u,u,u,u,u],zero,ymm0[17,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
 ; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm28
-; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
-; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
-; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm10
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm8, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm11
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm13
-; AVX512DQ-FCP-NEXT:    vporq %xmm11, %xmm13, %xmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm14, %xmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm14, %xmm30
+; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm9, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm9, %xmm21
+; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm7
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm8
+; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm11
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm6, %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, %xmm9
+; AVX512DQ-FCP-NEXT:    vporq %xmm12, %xmm13, %xmm31
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm12
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm13
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm12
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm20
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm29
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm12
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm16
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm28
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm13
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
 ; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm11
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm11
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm25
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm11
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm11
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29]
-; AVX512DQ-FCP-NEXT:    vporq %ymm6, %ymm5, %ymm20
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm15
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm15, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm6
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT:    vpor %ymm13, %ymm15, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm24
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm1, %ymm14
+; AVX512DQ-FCP-NEXT:    vpor %ymm15, %ymm14, %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm4[28],zero,ymm4[30,31,30,31],zero,ymm4[29],zero,ymm4[31,28,29]
+; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
-; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vporq %ymm2, %ymm3, %ymm23
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero
-; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27,u,u,u],zero,ymm0[30],zero,ymm0[28,u,u,u],zero,ymm0[31],zero
+; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm3
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm2
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm29
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm1
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm23
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm15
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm11, %xmm17
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm0
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm19
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm12
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm12[13],zero,zero,zero,zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm20
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm17
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm6
-; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm31
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm2, %zmm18
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm5, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm13, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm3, %zmm28
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm26 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm30, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm20[2,3,2,3],zmm8[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm20 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm30 = zmm8[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm20, %zmm8, %zmm30
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm19[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm17[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm5, %zmm8, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm25 = zmm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm25, %zmm8, %zmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm3[2,3,2,3],zmm5[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm14, %zmm8, %zmm10
-; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[2,3,2,3],zmm5[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # ymm11 = mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm29[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm11, %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm11, %ymm16
-; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm12
-; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm11, %ymm2
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,0,1],zmm1[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
+; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm6, %ymm26
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm31[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm14 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vporq %zmm14, %zmm31, %zmm14
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm30 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vporq %zmm31, %zmm30, %zmm30
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm30
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm14 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm31 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vporq %zmm14, %zmm31, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq $184, %zmm30, %zmm31, %zmm14
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm18[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm31, %zmm13
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm7 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm29 = zmm29[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vporq %zmm7, %zmm29, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm27 = zmm27[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vporq %zmm28, %zmm27, %zmm27
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm7, %zmm31, %zmm27
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,2,3],zmm1[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm23[2,3,2,3],zmm7[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm22[2,3,2,3],zmm10[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm19, %xmm2
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,1,0,1,2,0,0,1]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm11, %ymm17, %ymm28
+; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm6
+; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm17
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [4,5,4,5,5,7,4,5]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm23, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
 ; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [4,5,4,5,5,7,4,5]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm13[13],zero,zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero
-; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm24[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm27[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm22[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[0,0,1,0]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm13
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm18
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm5, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm28[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm7[0,1,0,1,4,5,4,5]
-; AVX512DQ-FCP-NEXT:    vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm23, %ymm15
+; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm11, %zmm11
+; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm15, %zmm7
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm1 = mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # ymm18 = mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm3, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm21[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm19 = ymm25[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpternlogq $226, %zmm1, %zmm15, %zmm18
+; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm1 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm14 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm20
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm24[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5]
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm28, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm5
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm6 = zmm3[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm4
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm15, %zmm6
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm6
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm0
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm0
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8
-; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm8
+; AVX512DQ-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT:    # zmm6 = zmm3[0,1,2,3],mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11
+; AVX512DQ-FCP-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm11
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 384(%rax)
 ; AVX512DQ-FCP-NEXT:    addq $1432, %rsp # imm = 0x598
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index f5a6b9f59aacf..1771b53a0f835 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -2104,14 +2104,16 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    kmovd %ecx, %k3
 ; AVX512BW-NEXT:    vmovdqa32 %zmm4, %zmm5 {%k3}
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15]
+; AVX512BW-NEXT:    vpshufb %zmm4, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512BW-NEXT:    vpshufb %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839]
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -2157,14 +2159,16 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
 ; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT:    vpermq %zmm5, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm6, %zmm5, %zmm5
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm6, %zmm2, %zmm2
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm5, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT:    vpermq %zmm4, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm4 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm4, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm4, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -2209,14 +2213,16 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    kmovd %ecx, %k3
 ; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm4, %zmm5 {%k3}
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15]
+; AVX512DQ-BW-NEXT:    vpshufb %zmm4, %zmm2, %zmm2
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512DQ-BW-NEXT:    vpshufb %zmm4, %zmm3, %zmm3
 ; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839]
+; AVX512DQ-BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512DQ-BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
 ; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -2262,14 +2268,16 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
 ; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm5, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,2,10,2,10,u,u,u,u,3,11,3,11,u,u,u,u,4,12,4,12,u,u,u,u,5,13,5,13,u,u,u,u,6,14,6,14,u,u,u,u,7,15,7,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm6, %zmm5, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm6, %zmm2, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm5, %zmm2 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm4, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm4 = [134219776,151062785,167905794,184748803,201591812,218434821,235277830,252120839]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm4, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm4, %zmm0, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
@@ -8612,336 +8620,230 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa (%r10), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%r10), %xmm12
-; AVX512BW-NEXT:    vmovdqa64 32(%r10), %xmm16
-; AVX512BW-NEXT:    vmovdqa 48(%r10), %xmm15
-; AVX512BW-NEXT:    vmovdqa (%rax), %xmm2
-; AVX512BW-NEXT:    vmovdqa 16(%rax), %xmm13
-; AVX512BW-NEXT:    vmovdqa64 32(%rax), %xmm17
-; AVX512BW-NEXT:    vmovdqa64 48(%rax), %xmm18
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3]
-; AVX512BW-NEXT:    vpermw %ymm1, %ymm3, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa (%r9), %xmm4
-; AVX512BW-NEXT:    vmovdqa64 48(%r9), %xmm19
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512BW-NEXT:    vmovdqa64 48(%r8), %xmm21
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm6, %ymm6
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3]
-; AVX512BW-NEXT:    vpermw %ymm7, %ymm6, %ymm7
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm14
+; AVX512BW-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa 16(%r10), %xmm13
+; AVX512BW-NEXT:    vmovdqa64 32(%r10), %xmm22
+; AVX512BW-NEXT:    vmovdqa64 48(%r10), %xmm19
+; AVX512BW-NEXT:    vmovdqa (%rax), %xmm0
+; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 16(%rax), %xmm16
+; AVX512BW-NEXT:    vmovdqa64 32(%rax), %xmm23
+; AVX512BW-NEXT:    vmovdqa64 48(%rax), %xmm20
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa (%r9), %xmm5
+; AVX512BW-NEXT:    vmovdqa64 16(%r9), %xmm17
+; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm25
+; AVX512BW-NEXT:    vmovdqa64 48(%r9), %xmm21
+; AVX512BW-NEXT:    vmovdqa (%r8), %xmm7
+; AVX512BW-NEXT:    vmovdqa64 16(%r8), %xmm18
+; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm26
+; AVX512BW-NEXT:    vmovdqa64 48(%r8), %xmm24
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23]
+; AVX512BW-NEXT:    vpermw %zmm1, %zmm10, %zmm4
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23]
 ; AVX512BW-NEXT:    movl $-2004318072, %eax # imm = 0x88888888
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512BW-NEXT:    vmovdqa64 48(%rsi), %xmm24
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm8
-; AVX512BW-NEXT:    vmovdqa64 48(%rdi), %xmm27
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm9, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 48(%rsi), %xmm28
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX512BW-NEXT:    vmovdqa64 48(%rdi), %xmm30
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm10, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm9
-; AVX512BW-NEXT:    vmovdqa64 48(%rcx), %xmm28
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512BW-NEXT:    vmovdqa64 48(%rdx), %xmm29
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm11, %ymm11
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7]
-; AVX512BW-NEXT:    vpermw %ymm20, %ymm11, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm20, %zmm20
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX512BW-NEXT:    vmovdqa 48(%rcx), %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm11
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm12, %zmm12, %zmm14
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23]
 ; AVX512BW-NEXT:    movl $572662306, %eax # imm = 0x22222222
 ; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    vmovdqu16 %zmm20, %zmm1 {%k2}
-; AVX512BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; AVX512BW-NEXT:    kmovd %eax, %k3
-; AVX512BW-NEXT:    vmovdqa32 %zmm14, %zmm1 {%k3}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm20, %ymm20
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm14, %ymm3, %ymm14
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm14, %zmm14
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm22, %ymm22
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm20, %ymm6, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm20, %zmm23
-; AVX512BW-NEXT:    vmovdqu16 %zmm14, %zmm23 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm20, %ymm20
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm14, %ymm22, %ymm14
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm14, %zmm14
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm22, %ymm25
-; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm22
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
-; AVX512BW-NEXT:    vpermw %ymm20, %ymm11, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm20, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm25
-; AVX512BW-NEXT:    vmovdqu16 %zmm20, %zmm14 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512BW-NEXT:    vmovdqa32 %zmm23, %zmm14 {%k3}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm18, %ymm18
-; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm23
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm15, %ymm3, %ymm15
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm18, %zmm15, %zmm15
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm19, %ymm21
-; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm19
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm18, %ymm6, %ymm18
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm26
-; AVX512BW-NEXT:    vmovdqu16 %zmm15, %zmm18 {%k1}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
+; AVX512BW-NEXT:    vpermw %zmm14, %zmm12, %zmm3 {%k2}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm14, %zmm15
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm14, %zmm14
+; AVX512BW-NEXT:    vpermw %zmm14, %zmm10, %zmm14
+; AVX512BW-NEXT:    vpermw %zmm15, %zmm9, %zmm14 {%k1}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm27 = xmm15[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm29 = xmm15[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm29, %ymm27, %ymm27
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm24, %ymm15
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm15, %zmm15
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm24, %ymm24
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7]
-; AVX512BW-NEXT:    vpermw %ymm21, %ymm11, %ymm21
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm21, %zmm21
-; AVX512BW-NEXT:    vmovdqu16 %zmm21, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k3}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm18, %ymm3, %ymm18
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm24, %ymm24
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm21, %ymm6, %ymm21
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm21, %zmm27
-; AVX512BW-NEXT:    vmovdqu16 %zmm18, %zmm27 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm29, %ymm15
+; AVX512BW-NEXT:    vmovdqa 48(%rdx), %xmm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm27, %zmm15, %zmm15
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm27, %zmm27, %zmm27
+; AVX512BW-NEXT:    vpermw %zmm27, %zmm12, %zmm15 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm29
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
+; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm31
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
+; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm27
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm19, %zmm19, %zmm21
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm20, %zmm20, %zmm19
+; AVX512BW-NEXT:    vpermw %zmm19, %zmm10, %zmm19
+; AVX512BW-NEXT:    vpermw %zmm21, %zmm9, %zmm19 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm20[2,3,2,3]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm20[3,3,3,3]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
 ; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm24, %ymm18
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm24, %ymm28
-; AVX512BW-NEXT:    vmovdqa64 16(%r9), %xmm24
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7]
-; AVX512BW-NEXT:    vpermw %ymm21, %ymm11, %ymm21
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 16(%r8), %xmm28
-; AVX512BW-NEXT:    vmovdqu16 %zmm21, %zmm18 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 16(%rsi), %xmm21
-; AVX512BW-NEXT:    vmovdqa32 %zmm27, %zmm18 {%k3}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm17, %ymm17
-; AVX512BW-NEXT:    vmovdqa64 16(%rdi), %xmm27
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm16, %ymm3, %ymm16
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm16, %zmm16
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm17, %ymm25
-; AVX512BW-NEXT:    vmovdqa64 16(%rcx), %xmm17
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm22, %ymm6, %ymm22
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm22, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 16(%rdx), %xmm22
-; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm25 {%k1}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm23, %ymm16
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm16, %zmm16
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
-; AVX512BW-NEXT:    vpermw %ymm19, %ymm11, %ymm19
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm19
-; AVX512BW-NEXT:    vmovdqu16 %zmm19, %zmm16 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm16 {%k3}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm19, %ymm3, %ymm19
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm19
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm23, %ymm23
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm20, %ymm6, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm23, %zmm20, %zmm20
-; AVX512BW-NEXT:    vmovdqu16 %zmm19, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm24, %ymm20
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm20, %zmm20
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm12, %zmm20 {%k2}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpermw %zmm1, %zmm10, %zmm21
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm9, %zmm21 {%k1}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm0[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm0, %ymm24, %ymm0
+; AVX512BW-NEXT:    vmovdqa 32(%rdx), %xmm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm24
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm27[0],xmm2[1],xmm27[1],xmm2[2],xmm27[2],xmm2[3],xmm27[3],xmm2[4],xmm27[4],xmm2[5],xmm27[5],xmm2[6],xmm27[6],xmm2[7],xmm27[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm12, %zmm24 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 16(%rsi), %xmm28
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%rdi), %xmm30
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%rcx), %xmm25
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpermw %zmm1, %zmm10, %zmm22
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm9, %zmm22 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm23, %ymm23
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm25, %ymm19
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm23, %zmm19, %zmm19
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm26, %ymm25, %ymm25
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
-; AVX512BW-NEXT:    vpermw %ymm23, %ymm11, %ymm23
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm23, %zmm23
-; AVX512BW-NEXT:    vmovdqu16 %zmm23, %zmm19 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k3}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm13, %ymm13
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm12, %ymm3, %ymm12
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vpermw %ymm13, %ymm6, %ymm13
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm13, %zmm13
-; AVX512BW-NEXT:    vmovdqu16 %zmm12, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm20, %ymm20
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm12, %ymm21, %ymm12
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm12, %zmm12
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm20, %ymm20
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
-; AVX512BW-NEXT:    vpermw %ymm17, %ymm11, %ymm17
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm17, %zmm17
-; AVX512BW-NEXT:    vmovdqu16 %zmm17, %zmm12 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %zmm13, %zmm12 {%k3}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm2, %ymm2
-; AVX512BW-NEXT:    vpermw %ymm0, %ymm3, %ymm0
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512BW-NEXT:    vpermw %ymm2, %ymm6, %ymm2
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX512BW-NEXT:    vpermw %ymm3, %ymm11, %ymm3
-; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm0, %ymm23, %ymm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm23
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm27[8],xmm2[9],xmm27[9],xmm2[10],xmm27[10],xmm2[11],xmm27[11],xmm2[12],xmm27[12],xmm2[13],xmm27[13],xmm2[14],xmm27[14],xmm2[15],xmm27[15]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm12, %zmm23 {%k2}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm13[0],xmm16[1],xmm13[1],xmm16[2],xmm13[2],xmm16[3],xmm13[3],xmm16[4],xmm13[4],xmm16[5],xmm13[5],xmm16[6],xmm13[6],xmm16[7],xmm13[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpermw %zmm1, %zmm10, %zmm26
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm9, %zmm26 {%k1}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512BW-NEXT:    vmovdqa 16(%rdx), %xmm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm27
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm25[0],xmm2[1],xmm25[1],xmm2[2],xmm25[2],xmm2[3],xmm25[3],xmm2[4],xmm25[4],xmm2[5],xmm25[5],xmm2[6],xmm25[6],xmm2[7],xmm25[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm12, %zmm27 {%k2}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm13[8],xmm16[9],xmm13[9],xmm16[10],xmm13[10],xmm16[11],xmm13[11],xmm16[12],xmm13[12],xmm16[13],xmm13[13],xmm16[14],xmm13[14],xmm16[15],xmm13[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm13
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermw %zmm13, %zmm9, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
+; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512BW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
+; AVX512BW-NEXT:    # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpermw %zmm5, %zmm10, %zmm5
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm13, %zmm13, %zmm7
+; AVX512BW-NEXT:    vpermw %zmm7, %zmm9, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm7, %ymm7
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm9, %ymm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm25[8],xmm2[9],xmm25[9],xmm2[10],xmm25[10],xmm2[11],xmm25[11],xmm2[12],xmm25[12],xmm2[13],xmm25[13],xmm2[14],xmm25[14],xmm2[15],xmm25[15]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpermw %zmm2, %zmm12, %zmm1 {%k2}
+; AVX512BW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
+; AVX512BW-NEXT:    # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm7, %ymm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm6, %zmm6, %zmm6
+; AVX512BW-NEXT:    vpermw %zmm6, %zmm12, %zmm2 {%k2}
+; AVX512BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm14, %zmm15 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm20 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm24 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm23 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm26, %zmm27 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -9122,336 +9024,230 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT:    vmovdqa (%r10), %xmm0
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%r10), %xmm12
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r10), %xmm16
-; AVX512DQ-BW-NEXT:    vmovdqa 48(%r10), %xmm15
-; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %xmm2
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%rax), %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %xmm17
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rax), %xmm18
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3]
-; AVX512DQ-BW-NEXT:    vpermw %ymm1, %ymm3, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r9), %xmm19
-; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r8), %xmm21
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm6, %ymm6
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3]
-; AVX512DQ-BW-NEXT:    vpermw %ymm7, %ymm6, %ymm7
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm14
+; AVX512DQ-BW-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%r10), %xmm13
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r10), %xmm22
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r10), %xmm19
+; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rax), %xmm16
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %xmm23
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rax), %xmm20
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r9), %xmm17
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm25
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r9), %xmm21
+; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm7
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r8), %xmm18
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm26
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r8), %xmm24
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23]
+; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm10, %zmm4
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23]
 ; AVX512DQ-BW-NEXT:    movl $-2004318072, %eax # imm = 0x88888888
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm1, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rsi), %xmm24
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm8
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdi), %xmm27
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm9, %ymm9
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm9, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rsi), %xmm28
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm6
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdi), %xmm30
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm10, %ymm1
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm9
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rcx), %xmm28
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdx), %xmm29
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm11, %ymm11
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm20, %ymm11, %ymm20
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm20, %zmm20
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm1, %ymm1
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm8
+; AVX512DQ-BW-NEXT:    vmovdqa 48(%rcx), %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm11
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm12, %zmm12, %zmm14
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23]
 ; AVX512DQ-BW-NEXT:    movl $572662306, %eax # imm = 0x22222222
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm20, %zmm1 {%k2}
-; AVX512DQ-BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm14, %zmm1 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm20, %ymm20
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm14, %ymm3, %ymm14
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm14, %zmm14
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm22, %ymm22
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm20, %ymm6, %ymm20
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm20, %zmm23
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm14, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm20, %ymm20
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm14, %ymm22, %ymm14
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm14, %zmm14
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm22, %ymm25
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm22
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm20, %ymm11, %ymm20
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm20, %zmm20
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm25
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm20, %zmm14 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm23, %zmm14 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm18, %ymm18
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm23
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm15, %ymm3, %ymm15
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm18, %zmm15, %zmm15
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm19, %ymm21
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm19
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm18, %ymm6, %ymm18
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm26
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm15, %zmm18 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpermw %zmm14, %zmm12, %zmm3 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm14, %zmm15
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm14, %zmm14
+; AVX512DQ-BW-NEXT:    vpermw %zmm14, %zmm10, %zmm14
+; AVX512DQ-BW-NEXT:    vpermw %zmm15, %zmm9, %zmm14 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm27 = xmm15[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm29 = xmm15[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm29, %ymm27, %ymm27
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm24, %ymm15
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm15, %zmm15
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm24, %ymm24
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm21, %ymm11, %ymm21
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm21, %zmm21
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm21, %zmm15 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm15 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm18, %ymm3, %ymm18
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm24, %ymm24
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm21, %ymm6, %ymm21
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm24, %zmm21, %zmm27
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm18, %zmm27 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm15, %ymm29, %ymm15
+; AVX512DQ-BW-NEXT:    vmovdqa 48(%rdx), %xmm0
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm27, %zmm15, %zmm15
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm27, %zmm27, %zmm27
+; AVX512DQ-BW-NEXT:    vpermw %zmm27, %zmm12, %zmm15 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm29
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm31
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm27
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm19, %zmm19, %zmm21
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm20, %zmm20, %zmm19
+; AVX512DQ-BW-NEXT:    vpermw %zmm19, %zmm10, %zmm19
+; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm9, %zmm19 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm20[2,3,2,3]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm20[3,3,3,3]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm21, %ymm21
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm24, %ymm18
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm18, %zmm18
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm28, %ymm24, %ymm28
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r9), %xmm24
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm21, %ymm11, %ymm21
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm28, %zmm21, %zmm21
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r8), %xmm28
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm21, %zmm18 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rsi), %xmm21
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm27, %zmm18 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm27, %ymm17, %ymm17
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdi), %xmm27
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm16, %ymm3, %ymm16
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm16, %zmm16
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm17, %ymm25
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rcx), %xmm17
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm22, %ymm6, %ymm22
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm22, %zmm25
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdx), %xmm22
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm25 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm23, %ymm16
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm16, %zmm16
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm19, %ymm11, %ymm19
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm19
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm19, %zmm16 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm16 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm19, %ymm3, %ymm19
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm19
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm23, %ymm23
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm20, %ymm6, %ymm20
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm23, %zmm20, %zmm20
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm19, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm24, %ymm20
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm20, %zmm20
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm12, %zmm20 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm10, %zmm21
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm9, %zmm21 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm24 = xmm0[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm24, %ymm1, %ymm1
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm0, %ymm24, %ymm0
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %xmm2
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm24
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm27[0],xmm2[1],xmm27[1],xmm2[2],xmm27[2],xmm2[3],xmm27[3],xmm2[4],xmm27[4],xmm2[5],xmm27[5],xmm2[6],xmm27[6],xmm2[7],xmm27[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm12, %zmm24 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rsi), %xmm28
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdi), %xmm30
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rcx), %xmm25
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm10, %zmm22
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm9, %zmm22 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm25, %ymm23, %ymm23
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm19, %ymm25, %ymm19
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm23, %zmm19, %zmm19
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm26, %ymm25, %ymm25
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm23, %ymm11, %ymm23
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm25, %zmm23, %zmm23
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm23, %zmm19 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm13, %ymm13
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm12, %ymm3, %ymm12
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm20, %ymm20
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm13, %ymm6, %ymm13
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm13, %zmm13
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm12, %zmm13 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm20, %ymm20
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm12, %ymm21, %ymm12
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm12, %zmm12
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm20, %ymm20
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpermw %ymm17, %ymm11, %ymm17
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm17, %zmm17
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm17, %zmm12 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm13, %zmm12 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT:    vpermw %ymm0, %ymm3, %ymm0
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512DQ-BW-NEXT:    vpermw %ymm2, %ymm6, %ymm2
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm23, %ymm1, %ymm1
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm23 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
-; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX512DQ-BW-NEXT:    vpermw %ymm3, %ymm11, %ymm3
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm0 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k3}
+; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm0, %ymm23, %ymm0
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm23
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm27[8],xmm2[9],xmm27[9],xmm2[10],xmm27[10],xmm2[11],xmm27[11],xmm2[12],xmm27[12],xmm2[13],xmm27[13],xmm2[14],xmm27[14],xmm2[15],xmm27[15]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm12, %zmm23 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm13[0],xmm16[1],xmm13[1],xmm16[2],xmm13[2],xmm16[3],xmm13[3],xmm16[4],xmm13[4],xmm16[5],xmm13[5],xmm16[6],xmm13[6],xmm16[7],xmm13[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm10, %zmm26
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm9, %zmm26 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdx), %xmm2
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm27
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm25[0],xmm2[1],xmm25[1],xmm2[2],xmm25[2],xmm2[3],xmm25[3],xmm2[4],xmm25[4],xmm2[5],xmm25[5],xmm2[6],xmm25[6],xmm2[7],xmm25[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm12, %zmm27 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm13[8],xmm16[9],xmm13[9],xmm16[10],xmm13[10],xmm16[11],xmm13[11],xmm16[12],xmm13[12],xmm16[13],xmm13[13],xmm16[14],xmm13[14],xmm16[15],xmm13[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm0, %zmm13
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm10, %zmm0
+; AVX512DQ-BW-NEXT:    vpermw %zmm13, %zmm9, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
+; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
+; AVX512DQ-BW-NEXT:    # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm5, %zmm5
+; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm10, %zmm5
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm13, %zmm13, %zmm7
+; AVX512DQ-BW-NEXT:    vpermw %zmm7, %zmm9, %zmm5 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm7, %ymm7
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm9, %ymm1
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm25[8],xmm2[9],xmm25[9],xmm2[10],xmm25[10],xmm2[11],xmm25[11],xmm2[12],xmm25[12],xmm2[13],xmm25[13],xmm2[14],xmm25[14],xmm2[15],xmm25[15]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpermw %zmm2, %zmm12, %zmm1 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-BW-NEXT:    # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm7, %ymm2
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
+; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm6, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT:    vpermw %zmm6, %zmm12, %zmm2 {%k2}
+; AVX512DQ-BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm14, %zmm15 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm20 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm24 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm23 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm26, %zmm27 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, 192(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, 128(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 256(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, 128(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 448(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll
index 541dfb54e96d2..18cb5e1b86ec3 100644
--- a/llvm/test/CodeGen/X86/widen_bitcnt.ll
+++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll
@@ -629,45 +629,71 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm6
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm7, %xmm6, %xmm8
+; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm8, %xmm9
+; AVX2-NEXT:    vpand %xmm4, %xmm9, %xmm4
+; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
+; AVX2-NEXT:    vpaddb %xmm4, %xmm8, %xmm4
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm8
+; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
+; AVX2-NEXT:    vpand %xmm4, %xmm8, %xmm8
+; AVX2-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX2-NEXT:    vpaddw %xmm4, %xmm8, %xmm4
+; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm8
+; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm9
+; AVX2-NEXT:    vpand %xmm7, %xmm9, %xmm9
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm9, %xmm10
+; AVX2-NEXT:    vpand %xmm10, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufb %xmm9, %xmm5, %xmm9
+; AVX2-NEXT:    vpaddb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm9
+; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; AVX2-NEXT:    vpand %xmm9, %xmm8, %xmm9
+; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
+; AVX2-NEXT:    vpaddw %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufb %xmm2, %xmm5, %xmm9
+; AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm10
+; AVX2-NEXT:    vpand %xmm7, %xmm10, %xmm10
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm10, %xmm11
+; AVX2-NEXT:    vpand %xmm11, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
+; AVX2-NEXT:    vpaddb %xmm10, %xmm9, %xmm9
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm10
+; AVX2-NEXT:    vpsrlw $8, %xmm10, %xmm10
+; AVX2-NEXT:    vpand %xmm10, %xmm9, %xmm10
+; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; AVX2-NEXT:    vpaddw %xmm10, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm10
+; AVX2-NEXT:    vpsrlw $4, %xmm3, %xmm11
+; AVX2-NEXT:    vpand %xmm7, %xmm11, %xmm7
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm7, %xmm11
+; AVX2-NEXT:    vpand %xmm11, %xmm10, %xmm10
+; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX2-NEXT:    vpaddb %xmm5, %xmm10, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm3, %xmm6
+; AVX2-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX2-NEXT:    vpand %xmm6, %xmm5, %xmm6
+; AVX2-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX2-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm8, %ymm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm4
-; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm5, %ymm8
-; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm4
-; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
-; AVX2-NEXT:    vpaddb %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm1, %ymm5
-; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm5
-; AVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddw %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpaddd %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm4
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm4, %ymm5
-; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm0, %ymm3
-; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm3
-; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT:    retq
@@ -982,45 +1008,71 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm6
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm7, %xmm6, %xmm8
+; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm8, %xmm9
+; AVX2-NEXT:    vpand %xmm4, %xmm9, %xmm4
+; AVX2-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
+; AVX2-NEXT:    vpaddb %xmm4, %xmm8, %xmm4
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm8
+; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
+; AVX2-NEXT:    vpand %xmm4, %xmm8, %xmm8
+; AVX2-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; AVX2-NEXT:    vpaddw %xmm4, %xmm8, %xmm4
+; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm8
+; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm9
+; AVX2-NEXT:    vpand %xmm7, %xmm9, %xmm9
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm9, %xmm10
+; AVX2-NEXT:    vpand %xmm10, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufb %xmm9, %xmm5, %xmm9
+; AVX2-NEXT:    vpaddb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm9
+; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; AVX2-NEXT:    vpand %xmm9, %xmm8, %xmm9
+; AVX2-NEXT:    vpsrlw $8, %xmm8, %xmm8
+; AVX2-NEXT:    vpaddw %xmm9, %xmm8, %xmm8
+; AVX2-NEXT:    vpshufb %xmm2, %xmm5, %xmm9
+; AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm10
+; AVX2-NEXT:    vpand %xmm7, %xmm10, %xmm10
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm10, %xmm11
+; AVX2-NEXT:    vpand %xmm11, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm10, %xmm5, %xmm10
+; AVX2-NEXT:    vpaddb %xmm10, %xmm9, %xmm9
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm10
+; AVX2-NEXT:    vpsrlw $8, %xmm10, %xmm10
+; AVX2-NEXT:    vpand %xmm10, %xmm9, %xmm10
+; AVX2-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; AVX2-NEXT:    vpaddw %xmm10, %xmm9, %xmm9
+; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm10
+; AVX2-NEXT:    vpsrlw $4, %xmm3, %xmm11
+; AVX2-NEXT:    vpand %xmm7, %xmm11, %xmm7
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm7, %xmm11
+; AVX2-NEXT:    vpand %xmm11, %xmm10, %xmm10
+; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
+; AVX2-NEXT:    vpaddb %xmm5, %xmm10, %xmm5
+; AVX2-NEXT:    vpcmpeqb %xmm6, %xmm3, %xmm6
+; AVX2-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX2-NEXT:    vpand %xmm6, %xmm5, %xmm6
+; AVX2-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX2-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm8, %ymm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm4
-; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm5, %ymm8
-; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm4
-; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
-; AVX2-NEXT:    vpaddb %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm1, %ymm5
-; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm5
-; AVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddw %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; AVX2-NEXT:    vpaddd %ymm1, %ymm5, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm4
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm2
-; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
-; AVX2-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm4, %ymm5
-; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpeqb %ymm7, %ymm0, %ymm3
-; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm3
-; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpcmpeqw %ymm7, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/DebugInfo/Symbolize/ELF/aarch64-mapping-symbol.s b/llvm/test/DebugInfo/Symbolize/ELF/aarch64-mapping-symbol.s
index 6990a82c4acfe..2f1140cf62910 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/aarch64-mapping-symbol.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/aarch64-mapping-symbol.s
@@ -7,9 +7,9 @@
 ## addresses.
 # RUN: llvm-nm --special-syms %t | FileCheck %s -check-prefix MAPPING_SYM
 
-# MAPPING_SYM:      0000000000000000 t $d
-# MAPPING_SYM-NEXT: 000000000000000c t $d
-# MAPPING_SYM-NEXT: 0000000000000004 t $x
+# MAPPING_SYM:      0000000000000000 t $d.0
+# MAPPING_SYM-NEXT: 000000000000000c t $d.2
+# MAPPING_SYM-NEXT: 0000000000000004 t $x.1
 # MAPPING_SYM-NEXT: 0000000000000000 T foo
 
 # RUN: llvm-symbolizer --obj=%t 0 4 0xc | FileCheck %s -check-prefix SYMBOL
diff --git a/llvm/test/DebugInfo/Symbolize/ELF/csky-mapping-symbol.s b/llvm/test/DebugInfo/Symbolize/ELF/csky-mapping-symbol.s
index 609b5e7e01b61..d1cab1250636d 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/csky-mapping-symbol.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/csky-mapping-symbol.s
@@ -5,11 +5,11 @@
 
 ## Verify that mapping symbols are actually present in the object at expected
 ## addresses.
-# RUN: llvm-nm --special-syms %t | FileCheck %s -check-prefix MAPPING_SYM --match-full-lines
+# RUN: llvm-nm --special-syms %t | FileCheck %s -check-prefix MAPPING_SYM
 
-# MAPPING_SYM:      00000000 t $d
-# MAPPING_SYM-NEXT: 00000008 t $d
-# MAPPING_SYM-NEXT: 00000004 t $t
+# MAPPING_SYM:      00000000 t $d.0
+# MAPPING_SYM-NEXT: 00000008 t $d.2
+# MAPPING_SYM-NEXT: 00000004 t $t.1
 # MAPPING_SYM-NEXT: 00000000 T foo
 
 # RUN: llvm-symbolizer --obj=%t 0 4 0xc | FileCheck %s -check-prefix SYMBOL
diff --git a/llvm/test/DebugInfo/XCOFF/empty.ll b/llvm/test/DebugInfo/XCOFF/empty.ll
index 445e1a63447ef..c1393907169fc 100644
--- a/llvm/test/DebugInfo/XCOFF/empty.ll
+++ b/llvm/test/DebugInfo/XCOFF/empty.ll
@@ -79,7 +79,7 @@ entry:
 ; ASM32-NEXT:                                          # -- End function
 ; ASM32-NEXT:  L..sec_end0:
 ; ASM32:               .dwsect 0x60000
-; ASM32-NEXT:  .dwabrev:
+; ASM32-NEXT:  L...dwabrev:
 ; ASM32-NEXT:          .byte   1                               # Abbreviation Code
 ; ASM32-NEXT:          .byte   17                              # DW_TAG_compile_unit
 ; ASM32-NEXT:          .byte   1                               # DW_CHILDREN_yes
@@ -135,10 +135,10 @@ entry:
 ; ASM32-NEXT:          .byte   0                               # EOM(2)
 ; ASM32-NEXT:          .byte   0                               # EOM(3)
 ; ASM32:               .dwsect 0x10000
-; ASM32-NEXT:  .dwinfo:
+; ASM32-NEXT:  L...dwinfo:
 ; ASM32-NEXT:  L..cu_begin0:
 ; ASM32-NEXT:          .vbyte  2, 4                            # DWARF version number
-; ASM32-NEXT:          .vbyte  4, .dwabrev                  # Offset Into Abbrev. Section
+; ASM32-NEXT:          .vbyte  4, L...dwabrev                  # Offset Into Abbrev. Section
 ; ASM32-NEXT:          .byte   4                               # Address Size (in bytes)
 ; ASM32-NEXT:          .byte   1                               # Abbrev [1] 0xb:0x38 DW_TAG_compile_unit
 ; ASM32-NEXT:          .vbyte  4, L..info_string0              # DW_AT_producer
@@ -166,7 +166,7 @@ entry:
 ; ASM32-NEXT:          .byte   0                               # End Of Children Mark
 ; ASM32-NEXT:  L..debug_info_end0:
 ; ASM32:               .dwsect 0x70000
-; ASM32-NEXT:  .dwstr:
+; ASM32-NEXT:  L...dwstr:
 ; ASM32-NEXT:  L..info_string0:
 ; ASM32-NEXT:          .string "clang version 12.0.0"          # string offset=0
 ; ASM32-NEXT:  L..info_string1:
@@ -179,7 +179,7 @@ entry:
 ; ASM32-NEXT:          .string "int"                           # string offset=36
 ; ASM32-NEXT:          .toc
 ; ASM32:               .dwsect 0x20000
-; ASM32-NEXT:  .dwline:
+; ASM32-NEXT:  L...dwline:
 ; ASM32-NEXT:  L..debug_line_0:
 ; ASM32-NEXT:  .set L..line_table_start0, L..debug_line_0-4
 ; ASM32-NEXT:          .vbyte  2, 4
@@ -281,7 +281,7 @@ entry:
 ; ASM64-NEXT:                                          # -- End function
 ; ASM64-NEXT:  L..sec_end0:
 ; ASM64:               .dwsect 0x60000
-; ASM64-NEXT:  .dwabrev:
+; ASM64-NEXT:  L...dwabrev:
 ; ASM64-NEXT:          .byte   1                               # Abbreviation Code
 ; ASM64-NEXT:          .byte   17                              # DW_TAG_compile_unit
 ; ASM64-NEXT:          .byte   1                               # DW_CHILDREN_yes
@@ -337,10 +337,10 @@ entry:
 ; ASM64-NEXT:          .byte   0                               # EOM(2)
 ; ASM64-NEXT:          .byte   0                               # EOM(3)
 ; ASM64:               .dwsect 0x10000
-; ASM64-NEXT:  .dwinfo:
+; ASM64-NEXT:  L...dwinfo:
 ; ASM64-NEXT:  L..cu_begin0:
 ; ASM64-NEXT:          .vbyte  2, 4                            # DWARF version number
-; ASM64-NEXT:          .vbyte  8, .dwabrev                  # Offset Into Abbrev. Section
+; ASM64-NEXT:          .vbyte  8, L...dwabrev                  # Offset Into Abbrev. Section
 ; ASM64-NEXT:          .byte   8                               # Address Size (in bytes)
 ; ASM64-NEXT:          .byte   1                               # Abbrev [1] 0x17:0x58 DW_TAG_compile_unit
 ; ASM64-NEXT:          .vbyte  8, L..info_string0              # DW_AT_producer
@@ -368,7 +368,7 @@ entry:
 ; ASM64-NEXT:          .byte   0                               # End Of Children Mark
 ; ASM64-NEXT:  L..debug_info_end0:
 ; ASM64:               .dwsect 0x70000
-; ASM64-NEXT:  .dwstr:
+; ASM64-NEXT:  L...dwstr:
 ; ASM64-NEXT:  L..info_string0:
 ; ASM64-NEXT:          .string "clang version 12.0.0"          # string offset=0
 ; ASM64-NEXT:  L..info_string1:
@@ -381,7 +381,7 @@ entry:
 ; ASM64-NEXT:          .string "int"                           # string offset=36
 ; ASM64-NEXT:         .toc
 ; ASM64:               .dwsect 0x20000
-; ASM64-NEXT:  .dwline:
+; ASM64-NEXT:  L...dwline:
 ; ASM64-NEXT:  L..debug_line_0:
 ; ASM64-NEXT:  .set L..line_table_start0, L..debug_line_0-12
 ; ASM64-NEXT:          .vbyte  2, 4
diff --git a/llvm/test/DebugInfo/XCOFF/explicit-section.ll b/llvm/test/DebugInfo/XCOFF/explicit-section.ll
index 530267ec634fc..ed2ffb709168e 100644
--- a/llvm/test/DebugInfo/XCOFF/explicit-section.ll
+++ b/llvm/test/DebugInfo/XCOFF/explicit-section.ll
@@ -130,7 +130,7 @@ entry:
 ; CHECK-NEXT:                                          # -- End function
 ; CHECK-NEXT:  L..sec_end0:
 ; CHECK:               .dwsect 0x60000
-; CHECK-NEXT:  .dwabrev:
+; CHECK-NEXT:  L...dwabrev:
 ; CHECK-NEXT:          .byte   1                               # Abbreviation Code
 ; CHECK-NEXT:          .byte   17                              # DW_TAG_compile_unit
 ; CHECK-NEXT:          .byte   1                               # DW_CHILDREN_yes
@@ -184,10 +184,10 @@ entry:
 ; CHECK-NEXT:          .byte   0                               # EOM(2)
 ; CHECK-NEXT:          .byte   0                               # EOM(3)
 ; CHECK:               .dwsect 0x10000
-; CHECK-NEXT:  .dwinfo:
+; CHECK-NEXT:  L...dwinfo:
 ; CHECK-NEXT:  L..cu_begin0:
 ; CHECK-NEXT:          .vbyte  2, 3                            # DWARF version number
-; CHECK-NEXT:          .vbyte  4, .dwabrev                  # Offset Into Abbrev. Section
+; CHECK-NEXT:          .vbyte  4, L...dwabrev                  # Offset Into Abbrev. Section
 ; CHECK-NEXT:          .byte   4                               # Address Size (in bytes)
 ; CHECK-NEXT:          .byte   1                               # Abbrev [1] 0xb:0x4f DW_TAG_compile_unit
 ; CHECK-NEXT:          .vbyte  4, L..info_string0              # DW_AT_producer
@@ -224,7 +224,7 @@ entry:
 ; CHECK-NEXT:          .byte   0                               # End Of Children Mark
 ; CHECK-NEXT:  L..debug_info_end0:
 ; CHECK:               .dwsect 0x80000
-; CHECK-NEXT:  .dwrnges:
+; CHECK-NEXT:  L...dwrnges:
 ; CHECK-NEXT:  L..debug_ranges0:
 ; CHECK-NEXT:          .vbyte  4, L..func_begin0
 ; CHECK-NEXT:          .vbyte  4, L..func_end0
@@ -233,7 +233,7 @@ entry:
 ; CHECK-NEXT:          .vbyte  4, 0
 ; CHECK-NEXT:          .vbyte  4, 0
 ; CHECK:               .dwsect 0x70000
-; CHECK-NEXT:  .dwstr:
+; CHECK-NEXT:  L...dwstr:
 ; CHECK-NEXT:  L..info_string0:
 ; CHECK-NEXT:  	.string	"clang version 13.0.0"          # string offset=0
 ; CHECK-NEXT:  L..info_string1:
@@ -248,7 +248,7 @@ entry:
 ; CHECK-NEXT:  	.string	"main"                          # string offset=39
 ; CHECK-NEXT:          .toc
 ; CHECK:               .dwsect 0x20000
-; CHECK-NEXT:  .dwline:
+; CHECK-NEXT:  L...dwline:
 ; CHECK-NEXT:  L..debug_line_0:
 ; CHECK-NEXT:  .set L..line_table_start0, L..debug_line_0-4
 ; CHECK-NEXT:          .vbyte  2, 3
diff --git a/llvm/test/DebugInfo/XCOFF/function-sections.ll b/llvm/test/DebugInfo/XCOFF/function-sections.ll
index 5103b66db114d..c899089102c64 100644
--- a/llvm/test/DebugInfo/XCOFF/function-sections.ll
+++ b/llvm/test/DebugInfo/XCOFF/function-sections.ll
@@ -112,7 +112,7 @@ entry:
 ; CHECK-NEXT:                                          # -- End function
 ; CHECK-NEXT:  L..sec_end0:
 ; CHECK:               .dwsect 0x60000
-; CHECK-NEXT:  .dwabrev:
+; CHECK-NEXT:  L...dwabrev:
 ; CHECK-NEXT:          .byte   1                               # Abbreviation Code
 ; CHECK-NEXT:          .byte   17                              # DW_TAG_compile_unit
 ; CHECK-NEXT:          .byte   1                               # DW_CHILDREN_yes
@@ -168,10 +168,10 @@ entry:
 ; CHECK-NEXT:          .byte   0                               # EOM(2)
 ; CHECK-NEXT:          .byte   0                               # EOM(3)
 ; CHECK:               .dwsect 0x10000
-; CHECK-NEXT:  .dwinfo:
+; CHECK-NEXT:  L...dwinfo:
 ; CHECK-NEXT:  L..cu_begin0:
 ; CHECK-NEXT:          .vbyte  2, 3                            # DWARF version number
-; CHECK-NEXT:          .vbyte  4, .dwabrev                  # Offset Into Abbrev. Section
+; CHECK-NEXT:          .vbyte  4, L...dwabrev                  # Offset Into Abbrev. Section
 ; CHECK-NEXT:          .byte   4                               # Address Size (in bytes)
 ; CHECK-NEXT:          .byte   1                               # Abbrev [1] 0xb:0x51 DW_TAG_compile_unit
 ; CHECK-NEXT:          .vbyte  4, L..info_string0              # DW_AT_producer
@@ -210,7 +210,7 @@ entry:
 ; CHECK-NEXT:          .byte   0                               # End Of Children Mark
 ; CHECK-NEXT:  L..debug_info_end0:
 ; CHECK:               .dwsect 0x80000
-; CHECK-NEXT:  .dwrnges:
+; CHECK-NEXT:  L...dwrnges:
 ; CHECK-NEXT:  L..debug_ranges0:
 ; CHECK-NEXT:          .vbyte  4, L..func_begin0
 ; CHECK-NEXT:          .vbyte  4, L..func_end0
@@ -219,7 +219,7 @@ entry:
 ; CHECK-NEXT:          .vbyte  4, 0
 ; CHECK-NEXT:          .vbyte  4, 0
 ; CHECK:               .dwsect 0x70000
-; CHECK-NEXT:  .dwstr:
+; CHECK-NEXT:  L...dwstr:
 ; CHECK-NEXT:  L..info_string0:
 ; CHECK-NEXT:          .string "clang version 13.0.0"          # string offset=0
 ; CHECK-NEXT:  L..info_string1:
@@ -234,7 +234,7 @@ entry:
 ; CHECK-NEXT:          .string "bar"                           # string offset=39
 ; CHECK-NEXT:          .toc
 ; CHECK:               .dwsect 0x20000
-; CHECK-NEXT:  .dwline:
+; CHECK-NEXT:  L...dwline:
 ; CHECK-NEXT:  L..debug_line_0:
 ; CHECK-NEXT:  .set L..line_table_start0, L..debug_line_0-4
 ; CHECK-NEXT:          .vbyte  2, 3
diff --git a/llvm/test/DebugInfo/omit-empty.ll b/llvm/test/DebugInfo/omit-empty.ll
index 351d3055e039b..0267ec5556f11 100644
--- a/llvm/test/DebugInfo/omit-empty.ll
+++ b/llvm/test/DebugInfo/omit-empty.ll
@@ -1,4 +1,4 @@
-; RUN: %llc_dwarf %s -filetype=obj -generate-arange-section -o - | llvm-objdump -h - | FileCheck %s
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-objdump -h - | FileCheck %s
 ; REQUIRES: object-emission
 
 ; CHECK-NOT: .debug_
diff --git a/llvm/test/Instrumentation/AddressSanitizer/skip-coro.ll b/llvm/test/Instrumentation/AddressSanitizer/skip-coro.ll
deleted file mode 100644
index 65b27ee2241dd..0000000000000
--- a/llvm/test/Instrumentation/AddressSanitizer/skip-coro.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; Tests that asan skips pre-split coroutine and NoopCoro.Frame
-; RUN: opt < %s -S -passes=coro-early,asan | FileCheck %s
-
-; CHECK: %NoopCoro.Frame = type { ptr, ptr }
-; CHECK: @NoopCoro.Frame.Const = private constant %NoopCoro.Frame { ptr @__NoopCoro_ResumeDestroy, ptr @__NoopCoro_ResumeDestroy }
-; CHECK-NOT: @0 = private alias { %NoopCoro.Frame,
-
-%struct.Promise = type { %"struct.std::__n4861::coroutine_handle" }
-%"struct.std::__n4861::coroutine_handle" = type { ptr }
-
-; CHECK-LABEL: @foo(
-define ptr @foo() #0 {
-; CHECK-NEXT: entry
-; CHECK-NOT: %asan_local_stack_base
-entry:
-  %__promise = alloca %struct.Promise, align 8
-  %0 = call token @llvm.coro.id(i32 16, ptr nonnull %__promise, ptr null, ptr null)
-  %1 = call ptr @llvm.coro.noop()
-  ret ptr %1
-}
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
-declare ptr @llvm.coro.noop()
-
-attributes #0 = { sanitize_address presplitcoroutine }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "hand-written", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "<stdin>", directory: "")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/MC/AArch64/CheckDataSymbol.s b/llvm/test/MC/AArch64/CheckDataSymbol.s
index 13d1db7da091b..baa1158984386 100644
--- a/llvm/test/MC/AArch64/CheckDataSymbol.s
+++ b/llvm/test/MC/AArch64/CheckDataSymbol.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -filetype=obj -assemble \
 # RUN: -triple=aarch64- %s -o - \
 # RUN: | llvm-readobj -S --symbols - | FileCheck %s
-# CHECK:     Name: $d ({{[1-9][0-9]+}})
+# CHECK:     Name: $d.1 ({{[1-9][0-9]+}})
 # CHECK-NEXT:     Value: 0x4
 # CHECK-NEXT:     Size: 0
 # CHECK-NEXT:     Binding: Local (0x0)
diff --git a/llvm/test/MC/AArch64/mapping-across-sections.s b/llvm/test/MC/AArch64/mapping-across-sections.s
index f453c86d45fb6..6bb5a8811b57d 100644
--- a/llvm/test/MC/AArch64/mapping-across-sections.s
+++ b/llvm/test/MC/AArch64/mapping-across-sections.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-objdump -t - | FileCheck %s --match-full-lines
+// RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-objdump -t - | FileCheck %s
 
 .section .text1,"ax"
 add w0, w0, w0
@@ -28,13 +28,13 @@ add w0, w0, w0
 .section ".note.GNU-stack","", at progbits
 
 // CHECK:      SYMBOL TABLE:
-// CHECK-NEXT: 0000000000000000 l       .text1 0000000000000000 $x
-// CHECK-NEXT: 0000000000000000 l       .text  0000000000000000 $x
-// CHECK-NEXT: 0000000000000004 l       .text  0000000000000000 $d
-// CHECK-NEXT: 0000000000000000 l       .data  0000000000000000 $d
-// CHECK-NEXT: 0000000000000008 l       .text  0000000000000000 $x
-// CHECK-NEXT: 000000000000000c l       .text  0000000000000000 $d
-// CHECK-NEXT: 0000000000000000 l       .rodata        0000000000000000 $d
-// CHECK-NEXT: 0000000000000004 l       .rodata        0000000000000000 $x
-// CHECK-NEXT: 0000000000000000 l       .comment       0000000000000000 $d
+// CHECK-NEXT: 0000000000000000 l       .text1 0000000000000000 $x.0
+// CHECK-NEXT: 0000000000000000 l       .text  0000000000000000 $x.1
+// CHECK-NEXT: 0000000000000004 l       .text  0000000000000000 $d.2
+// CHECK-NEXT: 0000000000000000 l       .data  0000000000000000 $d.3
+// CHECK-NEXT: 0000000000000008 l       .text  0000000000000000 $x.4
+// CHECK-NEXT: 000000000000000c l       .text  0000000000000000 $d.5
+// CHECK-NEXT: 0000000000000000 l       .rodata        0000000000000000 $d.6
+// CHECK-NEXT: 0000000000000004 l       .rodata        0000000000000000 $x.7
+// CHECK-NEXT: 0000000000000000 l       .comment       0000000000000000 $d.8
 // CHECK-NOT:  {{.}}
diff --git a/llvm/test/MC/AArch64/mapping-within-section.s b/llvm/test/MC/AArch64/mapping-within-section.s
index c84e3a4d2fe64..07a2d3cc0fed9 100644
--- a/llvm/test/MC/AArch64/mapping-within-section.s
+++ b/llvm/test/MC/AArch64/mapping-within-section.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj %s | llvm-nm --no-sort --special-syms - | FileCheck %s --match-full-lines
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj %s | llvm-nm --special-syms - | FileCheck %s
 
     .text
 // $x at 0x0000
@@ -18,13 +18,6 @@
 // $x at 0x0018
     add x0, x0, x0
 
-.globl $d
-$d:
-$x:
-
-// CHECK:      0000000000000000 t $x
-// CHECK-NEXT: 0000000000000004 t $d
-// CHECK-NEXT: 0000000000000064 t $x
-// CHECK-NEXT: 0000000000000068 t $x
-// CHECK-NEXT: 0000000000000068 T $d
-// CHECK-NOT:  {{.}}
+// CHECK:      0000000000000004 t $d.1
+// CHECK-NEXT: 0000000000000000 t $x.0
+// CHECK-NEXT: 0000000000000064 t $x.2
diff --git a/llvm/test/MC/AArch64/size-directive.s b/llvm/test/MC/AArch64/size-directive.s
index d9bdf970cf68b..0b19cda4eaa1e 100644
--- a/llvm/test/MC/AArch64/size-directive.s
+++ b/llvm/test/MC/AArch64/size-directive.s
@@ -32,7 +32,7 @@ aarch64_size:
 // CHECK-OBJ-NEXT: )
 
 // SYMS:      Type   Bind   Vis     Ndx Name
-// SYMS:      NOTYPE LOCAL  DEFAULT   3 $d{{$}}
+// SYMS:      NOTYPE LOCAL  DEFAULT   3 $d.0
 // SYMS-NEXT: FUNC   GLOBAL DEFAULT   3 aarch64_size
 // SYMS-NEXT: NOTYPE GLOBAL DEFAULT UND half_word
 // SYMS-NEXT: NOTYPE GLOBAL DEFAULT UND full_word
diff --git a/llvm/test/MC/ELF/AArch64/cfi.s b/llvm/test/MC/ELF/AArch64/cfi.s
index 6bdf03cc7bb85..033c8d9c04094 100644
--- a/llvm/test/MC/ELF/AArch64/cfi.s
+++ b/llvm/test/MC/ELF/AArch64/cfi.s
@@ -227,7 +227,7 @@ f37:
         .cfi_endproc
 
 // CHECK:       Section {
-// CHECK:         Name: .eh_frame
+// CHECK:         Name: .eh_frame (20)
 // CHECK-NEXT:    Type: SHT_PROGBITS (0x1)
 // CHECK-NEXT:    Flags [ (0x2)
 // CHECK-NEXT:      SHF_ALLOC (0x2)
@@ -355,7 +355,7 @@ f37:
 // CHECK-NEXT:    )
 // CHECK-NEXT:  }
 // CHECK:       Section {
-// CHECK:         Name: .rela.eh_frame
+// CHECK:         Name: .rela.eh_frame (15)
 // CHECK-NEXT:    Type: SHT_RELA (0x4)
 // CHECK-NEXT:    Flags [ (0x40)
 // CHECK-NEXT:      SHF_INFO_LINK (0x40)
diff --git a/llvm/test/MC/RISCV/mapping-across-sections.s b/llvm/test/MC/RISCV/mapping-across-sections.s
index 11b05a616bdcf..700e86a0eb70e 100644
--- a/llvm/test/MC/RISCV/mapping-across-sections.s
+++ b/llvm/test/MC/RISCV/mapping-across-sections.s
@@ -1,5 +1,5 @@
-# RUN: llvm-mc -triple=riscv32 -filetype=obj %s | llvm-readelf -Ss - | FileCheck %s
-# RUN: llvm-mc -triple=riscv64 -filetype=obj %s | llvm-readelf -Ss - | FileCheck %s
+# RUN: llvm-mc -triple=riscv32 -filetype=obj < %s | llvm-readelf -Ss - | FileCheck %s
+# RUN: llvm-mc -triple=riscv64 -filetype=obj < %s | llvm-readelf -Ss - | FileCheck %s
 
         .text
         nop
@@ -28,6 +28,6 @@
 # CHECK: [[#STARTS_DATA:]]] .starts_data
 
 # CHECK:    Value  Size Type    Bind   Vis     Ndx              Name
-# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#TEXT]]        $x{{$}}
-# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#WIBBLE]]      $x{{$}}
-# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#STARTS_DATA]] $d{{$}}
+# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#TEXT]]        $x
+# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#WIBBLE]]      $x
+# CHECK: 00000000     0 NOTYPE  LOCAL  DEFAULT [[#STARTS_DATA]] $d
diff --git a/llvm/test/MC/RISCV/rv32zacas-invalid.s b/llvm/test/MC/RISCV/rv32zacas-invalid.s
index bad2edcaaa915..66f939d139a12 100644
--- a/llvm/test/MC/RISCV/rv32zacas-invalid.s
+++ b/llvm/test/MC/RISCV/rv32zacas-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc -triple riscv32 -mattr=+a,+experimental-zacas < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple riscv32 -mattr=+a,zacas < %s 2>&1 | FileCheck %s
 
 # Non-zero offsets not supported for the third operand (rs1).
 amocas.w a1, a3, 1(a5) # CHECK: :[[@LINE]]:18: error: optional integer offset must be 0
diff --git a/llvm/test/MC/RISCV/rv32zacas-valid.s b/llvm/test/MC/RISCV/rv32zacas-valid.s
index 8ba2b02542bc0..0e76f02399483 100644
--- a/llvm/test/MC/RISCV/rv32zacas-valid.s
+++ b/llvm/test/MC/RISCV/rv32zacas-valid.s
@@ -1,12 +1,12 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+experimental-zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+experimental-zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+experimental-zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+experimental-zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+experimental-zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 # RUN: not llvm-mc -triple=riscv32 -mattr=+a -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rv64zacas-invalid.s b/llvm/test/MC/RISCV/rv64zacas-invalid.s
index 854e6fe308b0a..e6a4e4007e978 100644
--- a/llvm/test/MC/RISCV/rv64zacas-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zacas-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc -triple riscv64 -mattr=+a,+experimental-zacas < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zacas < %s 2>&1 | FileCheck %s
 
 # Non-zero offsets not supported for the third operand (rs1).
 amocas.w a1, a3, 1(a5) # CHECK: :[[@LINE]]:18: error: optional integer offset must be 0
diff --git a/llvm/test/MC/RISCV/rv64zacas-valid.s b/llvm/test/MC/RISCV/rv64zacas-valid.s
index d5044a0e0671d..c6bf1252fa40d 100644
--- a/llvm/test/MC/RISCV/rv64zacas-valid.s
+++ b/llvm/test/MC/RISCV/rv64zacas-valid.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+experimental-zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+experimental-zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+experimental-zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 # RUN: not llvm-mc -triple=riscv64 -mattr=+a -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
index f8aa6867aedc6..97afb9d6563e5 100644
--- a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
+++ b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
@@ -1,12 +1,12 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+experimental-zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+experimental-zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha,+experimental-zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+zabha,+experimental-zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha,+zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+zabha,+zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zabha,+experimental-zacas < %s \
-# RUN:     | llvm-objdump --mattr=+a,+zabha,+experimental-zacas -M no-aliases -d -r - \
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zabha,+zacas < %s \
+# RUN:     | llvm-objdump --mattr=+a,+zabha,+zacas -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 # RUN: not llvm-mc -triple=riscv32 -mattr=+a,+zabha -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/Object/ARM/nm-mapping-symbol.s b/llvm/test/Object/ARM/nm-mapping-symbol.s
new file mode 100644
index 0000000000000..6b41876c35e5e
--- /dev/null
+++ b/llvm/test/Object/ARM/nm-mapping-symbol.s
@@ -0,0 +1,12 @@
+// RUN: llvm-mc %s -o %t.o -filetype=obj -triple=armv7-pc-linux
+// RUN: llvm-readobj --symbols %t.o | FileCheck %s
+// RUN: llvm-nm %t.o | FileCheck -allow-empty --check-prefix=NM %s
+
+// Test that nm doesn't print the mapping symbols
+
+// CHECK: Name: $d.0
+// NM-NOT: $d.0
+
+        .section        .foobar,"",%progbits
+        .asciz  "foo"
+        nop
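(Note on the $x/$d names used throughout these tests: they are ARM and
AArch64 ELF mapping symbols, which the assembler emits to mark where
instruction and literal-data regions begin. A minimal sketch, hypothetical
and not part of this patch, of where they land:

    .text
    add x0, x0, x0      // code region starts: assembler emits $x (or $x.0)
    .word 0x12345678    // data inside .text: assembler emits $d (or $d.1)
    add x1, x1, x1      // code resumes: another $x (or $x.2)

The .0/.1 suffixes are the unique-name form the updated CHECK lines in the
surrounding tests expect, and llvm-nm hides mapping symbols unless
--special-syms is given, which is what the NM-NOT check above relies on.)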
diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll
index f2e80814f347a..a524c9991f1bf 100644
--- a/llvm/test/Other/new-pm-print-pipeline.ll
+++ b/llvm/test/Other/new-pm-print-pipeline.ll
@@ -49,8 +49,8 @@
 ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(print<stack-lifetime><may>,print<stack-lifetime><must>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-17
 ; CHECK-17: function(print<stack-lifetime><may>,print<stack-lifetime><must>)
 
-; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
-; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)
+; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
+; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch>)
 
 ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>,loop-vectorize<interleave-forced-only;vectorize-forced-only>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-19
 ; CHECK-19: function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,loop-vectorize<interleave-forced-only;vectorize-forced-only;>)
diff --git a/llvm/test/TableGen/riscv-target-def.td b/llvm/test/TableGen/riscv-target-def.td
index c071cfd731cb5..7137cf96fd3d4 100644
--- a/llvm/test/TableGen/riscv-target-def.td
+++ b/llvm/test/TableGen/riscv-target-def.td
@@ -12,11 +12,6 @@ class RISCVExtension<string name, int major, int minor, string desc,
   bit Experimental = false;
 }
 
-class RISCVExtensionBitmask<bits<3> groupID, int bitPos> {
-  int GroupID = groupID;
-  int BitPos = bitPos;
-}
-
 class RISCVExperimentalExtension<string name, int major, int minor, string desc,
                                  list<RISCVExtension> implies = [],
                                  string fieldname = !subst("Feature", "Has", NAME),
@@ -28,8 +23,7 @@ class RISCVExperimentalExtension<string name, int major, int minor, string desc,
 
 def FeatureStdExtI
     : RISCVExtension<"i", 2, 1,
-                     "'I' (Base Integer Instruction Set)">,
-      RISCVExtensionBitmask<0, 8>;
+                     "'I' (Base Integer Instruction Set)">;
 
 def FeatureStdExtZicsr
     : RISCVExtension<"zicsr", 2, 0,
@@ -42,8 +36,7 @@ def FeatureStdExtZifencei
 def FeatureStdExtF
     : RISCVExtension<"f", 2, 2,
                      "'F' (Single-Precision Floating-Point)",
-                     [FeatureStdExtZicsr]>,
-      RISCVExtensionBitmask<0, 5>;
+                     [FeatureStdExtZicsr]>;
 
 def FeatureStdExtZidummy
     : RISCVExperimentalExtension<"zidummy", 0, 1,
@@ -178,10 +171,3 @@ def ROCKET : RISCVTuneProcessorModel<"rocket",
 // CHECK-NEXT: TUNE_PROC(ROCKET, "rocket")
 
 // CHECK: #undef TUNE_PROC
-
-// CHECK: #ifdef GET_RISCVExtensionBitmaskTable_IMPL
-// CHECK-NEXT: static const RISCVExtensionBitmask ExtensionBitmask[]={
-// CHECK-NEXT:     {"f", 0, 5ULL},
-// CHECK-NEXT:     {"i", 0, 8ULL},
-// CHECK-NEXT: };
-// CHECK-NEXT: #endif
diff --git a/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll
index 231aba5cc2f0c..bd225def3addc 100644
--- a/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll
@@ -4,7 +4,6 @@
 ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s
 ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s
 ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=3 -S | FileCheck --check-prefix=LIMIT-3 %s
-; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=4 -S | FileCheck --check-prefix=LIMIT-4 %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
@@ -72,47 +71,3 @@ bb3:
   store i32 0, ptr %P
   ret void
 }
-
-define void @duplicate_worklist_endoffunction(ptr %ptr.0, ptr %ptr.1) {
-; LIMIT-4-LABEL: @duplicate_worklist_endoffunction(
-; LIMIT-4-NEXT:  entry:
-; LIMIT-4-NEXT:    [[STACK_0:%.*]] = alloca [768 x i8], align 16
-; LIMIT-4-NEXT:    [[VAL_0:%.*]] = load i16, ptr [[PTR_1:%.*]], align 8
-; LIMIT-4-NEXT:    [[COND:%.*]] = icmp ugt i16 [[VAL_0]], 24
-; LIMIT-4-NEXT:    br i1 [[COND]], label [[BB_1:%.*]], label [[EXIT:%.*]]
-; LIMIT-4:       bb.1:
-; LIMIT-4-NEXT:    br label [[LOOP:%.*]]
-; LIMIT-4:       loop:
-; LIMIT-4-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[BB_1]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; LIMIT-4-NEXT:    [[PTR_3:%.*]] = getelementptr i8, ptr [[STACK_0]], i64 [[IV]]
-; LIMIT-4-NEXT:    store ptr [[PTR_0:%.*]], ptr [[PTR_3]], align 2
-; LIMIT-4-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
-; LIMIT-4-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 10
-; LIMIT-4-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]]
-; LIMIT-4:       exit:
-; LIMIT-4-NEXT:    ret void
-;
-entry:
-  %stack.0 = alloca [768 x i8], align 16
-  %stack.1 = alloca [20 x i8], align 8
-  %val.0 = load i16, ptr %ptr.1, align 8
-  %cond = icmp ugt i16 %val.0, 24
-  br i1 %cond, label %bb.1, label %exit
-
-bb.1:                                             ; preds = %entry
-  %ptr.2 = getelementptr inbounds i8, ptr %ptr.1, i64 8
-  %val.1 = load i64, ptr %ptr.2, align 8
-  store i64 %val.1, ptr %stack.1, align 8
-  br label %loop
-
-loop:                                             ; preds = %loop, %bb.1
-  %iv = phi i64 [ 0, %bb.1 ], [ %iv.next, %loop ]
-  %ptr.3 = getelementptr i8, ptr %stack.0, i64 %iv
-  store ptr %ptr.0, ptr %ptr.3, align 2
-  %iv.next = add nuw i64 %iv, 1
-  %exitcond = icmp eq i64 %iv.next, 10
-  br i1 %exitcond, label %exit, label %loop
-
-exit:                                             ; preds = %loop, %entry
-  ret void
-}
diff --git a/llvm/test/Transforms/GVN/condprop.ll b/llvm/test/Transforms/GVN/condprop.ll
index 3babdd8173a21..02f4bb97d7ebd 100644
--- a/llvm/test/Transforms/GVN/condprop.ll
+++ b/llvm/test/Transforms/GVN/condprop.ll
@@ -813,7 +813,7 @@ define void @select_same_obj(i1 %c, ptr %p, i64 %x) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[EXIT:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
-; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P3]])
 ; CHECK-NEXT:    ret void
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -902,7 +902,7 @@ define void @phi_same_obj(i1 %c, ptr %p, i64 %x) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF2:%.*]], label [[EXIT:%.*]]
 ; CHECK:       if2:
 ; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
-; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P3]])
 ; CHECK-NEXT:    ret void
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -975,7 +975,7 @@ define void @phi_same_obj_cycle(i1 %c, ptr %p, i64 %x) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[P_IV]], [[P]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[LOOP_LATCH]]
 ; CHECK:       if:
-; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[P_IV]])
 ; CHECK-NEXT:    call void @use_ptr(ptr [[P]])
 ; CHECK-NEXT:    br label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
index fb56764598e2d..1fa0c09a9e987 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -1315,19 +1315,6 @@ define i32 @select_replace_call_speculatable(i32 %x, i32 %y) {
   ret i32 %s
 }
 
-define i32 @select_replace_call_speculatable_intrinsic(i32 %x, i32 %y) {
-; CHECK-LABEL: @select_replace_call_speculatable_intrinsic(
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @llvm.smax.i32(i32 [[Y:%.*]], i32 0)
-; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i32 [[CALL]], i32 [[Y]]
-; CHECK-NEXT:    ret i32 [[S]]
-;
-  %c = icmp eq i32 %x, 0
-  %call = call i32 @llvm.smax.i32(i32 %x, i32 %y)
-  %s = select i1 %c, i32 %call, i32 %y
-  ret i32 %s
-}
-
 ; We can't replace the call arguments, as the call is not speculatable. We
 ; may end up changing side-effects or causing undefined behavior.
 define i32 @select_replace_call_non_speculatable(i32 %x, i32 %y) {
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 1369be305ec13..d66ffb9a63ac1 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -4713,19 +4713,3 @@ define i8 @select_knownbits_simplify_missing_noundef(i8 %x)  {
   %res = select i1 %cmp, i8 %and, i8 0
   ret i8 %res
 }
-
- at g_ext = external global i8
-
-; Make sure we don't replace %ptr with @g_ext, which may cause the load to trigger UB.
-define i32 @pr99436(ptr align 4 dereferenceable(4) %ptr) {
-; CHECK-LABEL: @pr99436(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[PTR:%.*]], @g_ext
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[PTR]], align 4
-; CHECK-NEXT:    [[RET:%.*]] = select i1 [[CMP]], i32 [[VAL]], i32 0
-; CHECK-NEXT:    ret i32 [[RET]]
-;
-  %cmp = icmp eq ptr %ptr, @g_ext
-  %val = load i32, ptr %ptr, align 4
-  %ret = select i1 %cmp, i32 %val, i32 0
-  ret i32 %ret
-}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 56b59012eef40..483955c1c57a0 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -384,51 +384,4 @@ for.exit:
   ret void
 }
 
-;; This test demonstrates an incorrect MUL VL address calculation. Here there
-;; are two writes that should be `16 * vscale * vscale` apart, however,
-;; loop-strength-reduce has ignored the second `vscale` and offset the second
-;; write by `#4, mul vl` which is an offset of `16 * vscale` dropping a vscale.
-define void @vscale_squared_offset(ptr %alloc) #0 {
-; COMMON-LABEL: vscale_squared_offset:
-; COMMON:       // %bb.0: // %entry
-; COMMON-NEXT:    fmov z0.s, #4.00000000
-; COMMON-NEXT:    mov x8, xzr
-; COMMON-NEXT:    cntw x9
-; COMMON-NEXT:    fmov z1.s, #8.00000000
-; COMMON-NEXT:    ptrue p0.s, vl1
-; COMMON-NEXT:    cmp x8, x9
-; COMMON-NEXT:    b.ge .LBB6_2
-; COMMON-NEXT:  .LBB6_1: // %for.body
-; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
-; COMMON-NEXT:    st1w { z0.s }, p0, [x0]
-; COMMON-NEXT:    add x8, x8, #1
-; COMMON-NEXT:    st1w { z1.s }, p0, [x0, #4, mul vl]
-; COMMON-NEXT:    addvl x0, x0, #1
-; COMMON-NEXT:    cmp x8, x9
-; COMMON-NEXT:    b.lt .LBB6_1
-; COMMON-NEXT:  .LBB6_2: // %for.exit
-; COMMON-NEXT:    ret
-entry:
-  %vscale = call i64 @llvm.vscale.i64()
-  %c4_vscale = mul i64 %vscale, 4
-  br label %for.check
-for.check:
-  %i = phi i64 [ %next_i, %for.body ], [ 0, %entry ]
-  %is_lt = icmp slt i64 %i, %c4_vscale
-  br i1 %is_lt, label %for.body, label %for.exit
-for.body:
-  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64 0, i64 1)
-  %upper_offset = mul i64 %i, %c4_vscale
-  %upper_ptr = getelementptr float, ptr %alloc, i64 %upper_offset
-  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 4.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %upper_ptr, i32 4, <vscale x 4 x i1> %mask)
-  %lower_i = add i64 %i, %c4_vscale
-  %lower_offset = mul i64 %lower_i, %c4_vscale
-  %lower_ptr = getelementptr float, ptr %alloc, i64 %lower_offset
-  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 8.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %lower_ptr, i32 4, <vscale x 4 x i1> %mask)
-  %next_i = add i64 %i, 1
-  br label %for.check
-for.exit:
-  ret void
-}
-
 attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
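(Worked form of the arithmetic in the comment of the test removed above,
using the test's own definitions, %c4_vscale = 4 * vscale with 4-byte float
elements, and the SVE vector length VL = 16 * vscale bytes:

    lower_offset - upper_offset = (i + 4*vscale) * 4*vscale - i * 4*vscale
                                = 16 * vscale * vscale elements,

while the emitted `st1w { z1.s }, p0, [x0, #4, mul vl]` reaches only
4 * VL = 64 * vscale bytes = 16 * vscale elements past x0, one factor of
vscale short, which is the incorrect address calculation the test
demonstrated.)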
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
index 09eada311d219..2cee1dacea37d 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx,+auto-vec -S | FileCheck %s
 
 ;; This is a collection of tests whose only purpose is to show changes in the
 ;; default configuration.  Please keep these tests minimal - if you're testing
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index a151232df0cd5..e50d7362365b8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -39,32 +39,32 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
 ; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
-; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; RV32-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; RV32-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i64> [[TMP7]], zeroinitializer
-; RV32-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP8]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 16, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
-; RV32-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
-; RV32-NEXT:    [[TMP12:%.*]] = mul i64 16, [[TMP11]]
-; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
+; RV32-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; RV32-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; RV32-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i64> [[TMP5]], zeroinitializer
+; RV32-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 16, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
+; RV32-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; RV32-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; RV32-NEXT:    [[TMP10:%.*]] = mul i64 16, [[TMP9]]
+; RV32-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0
 ; RV32-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; RV32-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; RV32:       vector.body:
 ; RV32-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; RV32-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP13]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison), !alias.scope [[META0:![0-9]+]]
-; RV32-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 100, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; RV32-NEXT:    [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; RV32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
-; RV32-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> [[TMP14]], <vscale x 2 x double> poison), !alias.scope [[META3:![0-9]+]]
-; RV32-NEXT:    [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
-; RV32-NEXT:    [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
-; RV32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> [[TMP14]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
-; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
+; RV32-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison), !alias.scope !0
+; RV32-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 100, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; RV32-NEXT:    [[TMP13:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; RV32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP13]]
+; RV32-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> [[TMP12]], <vscale x 2 x double> poison), !alias.scope !3
+; RV32-NEXT:    [[TMP15:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
+; RV32-NEXT:    [[TMP16:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP15]]
+; RV32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
+; RV32-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP16]], <vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> [[TMP12]]), !alias.scope !5, !noalias !7
+; RV32-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
 ; RV32-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV32-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV32-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -121,32 +121,32 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
 ; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
-; RV64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; RV64-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; RV64-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i64> [[TMP7]], zeroinitializer
-; RV64-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP8]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 16, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
-; RV64-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
-; RV64-NEXT:    [[TMP12:%.*]] = mul i64 16, [[TMP11]]
-; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
+; RV64-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; RV64-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; RV64-NEXT:    [[TMP6:%.*]] = add <vscale x 2 x i64> [[TMP5]], zeroinitializer
+; RV64-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 16, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
+; RV64-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; RV64-NEXT:    [[TMP10:%.*]] = mul i64 16, [[TMP9]]
+; RV64-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0
 ; RV64-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; RV64-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; RV64:       vector.body:
 ; RV64-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; RV64-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; RV64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV64-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP13]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison), !alias.scope [[META0:![0-9]+]]
-; RV64-NEXT:    [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 100, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; RV64-NEXT:    [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; RV64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
-; RV64-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 8, <vscale x 2 x i1> [[TMP14]], <vscale x 2 x double> poison), !alias.scope [[META3:![0-9]+]]
-; RV64-NEXT:    [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
-; RV64-NEXT:    [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
-; RV64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV64-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> [[TMP19]], i32 8, <vscale x 2 x i1> [[TMP14]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
-; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; RV64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
+; RV64-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison), !alias.scope !0
+; RV64-NEXT:    [[TMP12:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 100, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; RV64-NEXT:    [[TMP13:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; RV64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP13]]
+; RV64-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> [[TMP12]], <vscale x 2 x double> poison), !alias.scope !3
+; RV64-NEXT:    [[TMP15:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
+; RV64-NEXT:    [[TMP16:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP15]]
+; RV64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
+; RV64-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP16]], <vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> [[TMP12]]), !alias.scope !5, !noalias !7
+; RV64-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
 ; RV64-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; RV64-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index fc310f4163082..8e9713fecf29d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -54,46 +54,43 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<%0> = VF * UF
-; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
-; CHECK-NEXT:  vp<%2> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<for.body.preheader>:
-; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
+; CHECK:       ir-bb<for.body.preheader>:
+; CHECK-NEXT:    EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  No successors
-; CHECK-EMPTY:
-; CHECK-NEXT:  vector.ph:
+; CHECK:       vector.ph:
 ; CHECK-NEXT:  Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT:  <x1> vector loop: {
+; CHECK:       <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
-; CHECK-NEXT:      vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1>
-; CHECK-NEXT:      vp<%5> = SCALAR-STEPS vp<%4>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<%6> = vector-pointer (reverse) ir<%arrayidx>
-; CHECK-NEXT:      WIDEN ir<%1> = load vp<%6>
-; CHECK-NEXT:      WIDEN ir<%add9> = add ir<%1>, ir<1>
-; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<%7> = vector-pointer (reverse) ir<%arrayidx3>
-; CHECK-NEXT:      WIDEN store vp<%7>, ir<%add9>
-; CHECK-NEXT:      EMIT vp<%8> = add nuw vp<%3>, vp<%0>
-; CHECK-NEXT:      EMIT branch-on-count vp<%8>, vp<%1>
+; CHECK-NEXT:    EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT:    vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT:    vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>
+; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT:    CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%arrayidx>
+; CHECK-NEXT:    WIDEN ir<%1> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:    WIDEN ir<%add9> = add ir<%1>, ir<1>
+; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:    vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3>
+; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
+; CHECK-NEXT:    EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:    EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<%10> = icmp eq vp<%2>, vp<%1>
-; CHECK-NEXT:    EMIT branch-on-cond vp<%10>
+; CHECK:       middle.block:
+; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:  scalar.ph
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
@@ -137,52 +134,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<%0> = VF * UF
-; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
-; CHECK-NEXT:  vp<%2> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<for.body.preheader>:
-; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
-; CHECK-NEXT:  No successors
-; CHECK-EMPTY:
-; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:  Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT:  <x1> vector loop: {
-; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
-; CHECK-NEXT:      vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1>
-; CHECK-NEXT:      vp<%5> = SCALAR-STEPS vp<%4>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<%6> = vector-pointer (reverse) ir<%arrayidx>
-; CHECK-NEXT:      WIDEN ir<%13> = load vp<%6>
-; CHECK-NEXT:      WIDEN ir<%add9> = add ir<%13>, ir<1>
-; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<%7> = vector-pointer (reverse) ir<%arrayidx3>
-; CHECK-NEXT:      WIDEN store vp<%7>, ir<%add9>
-; CHECK-NEXT:      EMIT vp<%8> = add nuw vp<%3>, vp<%0>
-; CHECK-NEXT:      EMIT branch-on-count vp<%8>, vp<%1>
-; CHECK-NEXT:    No successors
-; CHECK-NEXT:  }
-; CHECK-NEXT:  Successor(s): middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<%10> = icmp eq vp<%2>, vp<%1>
-; CHECK-NEXT:    EMIT branch-on-cond vp<%10>
-; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT:  No successors
-; CHECK-EMPTY:
-; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:  No successors
-; CHECK-NEXT:  }
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ; CHECK-EMPTY:
@@ -241,7 +193,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
+; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -258,40 +210,37 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<%0> = VF * UF
-; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
-; CHECK-NEXT:  vp<%2> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<for.body.preheader>:
-; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
+; CHECK:       ir-bb<for.body.preheader>:
+; CHECK-NEXT:    EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  No successors
-; CHECK-EMPTY:
-; CHECK-NEXT:  vector.ph:
+; CHECK:       vector.ph:
 ; CHECK-NEXT:  Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT:  <x1> vector loop: {
+; CHECK:       <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
-; CHECK-NEXT:      vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1>
-; CHECK-NEXT:      vp<%5> = SCALAR-STEPS vp<%4>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<%6> = vector-pointer (reverse) ir<%arrayidx>
-; CHECK-NEXT:      WIDEN ir<%1> = load vp<%6>
-; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
-; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<%7> = vector-pointer (reverse) ir<%arrayidx3>
-; CHECK-NEXT:      WIDEN store vp<%7>, ir<%conv1>
-; CHECK-NEXT:      EMIT vp<%8> = add nuw vp<%3>, vp<%0>
-; CHECK-NEXT:      EMIT branch-on-count vp<%8>, vp<%1>
+; CHECK-NEXT:    EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT:    vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT:    vp<[[STEPS]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>
+; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT:    CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%arrayidx>
+; CHECK-NEXT:    WIDEN ir<%1> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:    WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
+; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT:    vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3>
+; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
+; CHECK-NEXT:    EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:    EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<%10> = icmp eq vp<%2>, vp<%1>
-; CHECK-NEXT:    EMIT branch-on-cond vp<%10>
+; CHECK:       middle.block:
+; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
@@ -306,7 +255,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
-; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
+; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
 ; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
@@ -332,7 +281,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 34
+; CHECK-NEXT:  LV: Loop cost is 32
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.
@@ -341,52 +290,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<%0> = VF * UF
-; CHECK-NEXT:  Live-in vp<%1> = vector-trip-count
-; CHECK-NEXT:  vp<%2> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<for.body.preheader>:
-; CHECK-NEXT:    EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64)
-; CHECK-NEXT:  No successors
-; CHECK-EMPTY:
-; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:  Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT:  <x1> vector loop: {
-; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
-; CHECK-NEXT:      vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1>
-; CHECK-NEXT:      vp<%5> = SCALAR-STEPS vp<%4>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%5>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<%6> = vector-pointer (reverse) ir<%arrayidx>
-; CHECK-NEXT:      WIDEN ir<%13> = load vp<%6>
-; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<%13>, ir<1.000000e+00>
-; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<%7> = vector-pointer (reverse) ir<%arrayidx3>
-; CHECK-NEXT:      WIDEN store vp<%7>, ir<%conv1>
-; CHECK-NEXT:      EMIT vp<%8> = add nuw vp<%3>, vp<%0>
-; CHECK-NEXT:      EMIT branch-on-count vp<%8>, vp<%1>
-; CHECK-NEXT:    No successors
-; CHECK-NEXT:  }
-; CHECK-NEXT:  Successor(s): middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<%10> = icmp eq vp<%2>, vp<%1>
-; CHECK-NEXT:    EMIT branch-on-cond vp<%10>
-; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
-; CHECK-NEXT:  No successors
-; CHECK-EMPTY:
-; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:  No successors
-; CHECK-NEXT:  }
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ;
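(The rewrite above replaces hard-coded VPlan ids such as vp<%8> with
FileCheck pattern variables so the checks survive renumbering. A minimal
sketch of the idiom, hypothetical lines rather than part of the patch:
[[NAME:regex]] captures the matched text on first use, [[NAME]] must match
the same text later, and {{regex}} matches without capturing:

    ; CHECK: Live-in vp<[[VFxUF:%.+]]> = VF * UF
    ; CHECK: EMIT vp<{{%.+}}> = add nuw vp<{{%.+}}>, vp<[[VFxUF]]>

The same capture-then-reuse pattern is what the CHECK-NEXT lines above use
for CAN_IV, STEPS, and VEC_TC.)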
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index a7c9a18127ade..b85d68c574a82 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
 
 declare void @init(ptr nocapture nofree)
 
@@ -17,7 +17,7 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
@@ -43,16 +43,16 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) {
 ; CHECK:       pred.load.continue2:
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer
-; CHECK-NEXT:    [[TMP15]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP16]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP17:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP16]])
 ; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -72,7 +72,7 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) {
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i16 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
@@ -114,7 +114,7 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_LOAD_CONTINUE2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
@@ -140,16 +140,16 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te
 ; CHECK:       pred.load.continue2:
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP14]], <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP15]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP16]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP16]])
 ; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -169,7 +169,7 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
 ;
 entry:
@@ -198,101 +198,3 @@ latch:
 loop_exit:
   ret i32 %accum.next
 }
-
-
-; Test where offset relative to alloca is negative and we shouldn't
-; treat predicated loads as being always dereferenceable.
-define i8 @test_negative_off(i16 %len, ptr %test_base) {
-; CHECK-LABEL: @test_negative_off(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [64638 x i8], align 1
-; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer
-; CHECK-NEXT:    [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]])
-; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT:    [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1
-; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
-; CHECK:       pred:
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1
-; CHECK-NEXT:    br label [[LATCH]]
-; CHECK:       latch:
-; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
-; CHECK-NEXT:    [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]]
-; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990
-; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       loop_exit:
-; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i8 [[ACCUM_NEXT_LCSSA]]
-;
-entry:
-  %alloca = alloca [64638 x i8]
-  call void @init(ptr %alloca)
-  br label %loop
-loop:
-  %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ]
-  %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ]
-  %iv.next = add i16 %iv, 1
-  %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv
-  %earlycnd = load i1, ptr %test_addr
-  br i1 %earlycnd, label %pred, label %latch
-pred:
-  %addr = getelementptr i8, ptr %alloca, i16 %iv
-  %val = load i8, ptr %addr
-  br label %latch
-latch:
-  %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ]
-  %accum.next = add i8 %accum, %val.phi
-  %exit = icmp ugt i16 %iv, -990
-  br i1 %exit, label %loop_exit, label %loop
-loop_exit:
-  ret i8 %accum.next
-}
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll b/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll
deleted file mode 100644
index 1c8be82715f25..0000000000000
--- a/llvm/test/Transforms/PGOProfile/Inputs/cspgo_bar_sample.ll
+++ /dev/null
@@ -1,82 +0,0 @@
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-$__llvm_profile_filename = comdat any
-$__llvm_profile_raw_version = comdat any
-$__llvm_profile_sampling = comdat any
-
-@odd = common dso_local local_unnamed_addr global i32 0, align 4
-@even = common dso_local local_unnamed_addr global i32 0, align 4
-@__llvm_profile_filename = local_unnamed_addr constant [25 x i8] c"pass2/default_%m.profraw\00", comdat
-@__llvm_profile_raw_version = local_unnamed_addr constant i64 216172782113783812, comdat
-@__llvm_profile_sampling = thread_local global i16 0, comdat
-@llvm.used = appending global [1 x i8*] [i8* bitcast (i64* @__llvm_profile_sampling to i8*)], section "llvm.metadata"
-
-define dso_local void @bar(i32 %n) !prof !30 {
-entry:
-  %call = tail call fastcc i32 @cond(i32 %n)
-  %tobool = icmp eq i32 %call, 0
-  br i1 %tobool, label %if.else, label %if.then, !prof !31
-
-if.then:
-  %0 = load i32, i32* @odd, align 4, !tbaa !32
-  %inc = add i32 %0, 1
-  store i32 %inc, i32* @odd, align 4, !tbaa !32
-  br label %if.end
-
-if.else:
-  %1 = load i32, i32* @even, align 4, !tbaa !32
-  %inc1 = add i32 %1, 1
-  store i32 %inc1, i32* @even, align 4, !tbaa !32
-  br label %if.end
-
-if.end:
-  ret void
-}
-
-define internal fastcc i32 @cond(i32 %i) #1 !prof !30 !PGOFuncName !36 {
-entry:
-  %rem = srem i32 %i, 2
-  ret i32 %rem
-}
-
-attributes #1 = { inlinehint noinline }
-
-!llvm.module.flags = !{!0, !1, !2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 1, !"EnableSplitLTOUnit", i32 0}
-!2 = !{i32 1, !"ProfileSummary", !3}
-!3 = !{!4, !5, !6, !7, !8, !9, !10, !11}
-!4 = !{!"ProfileFormat", !"InstrProf"}
-!5 = !{!"TotalCount", i64 500002}
-!6 = !{!"MaxCount", i64 200000}
-!7 = !{!"MaxInternalCount", i64 100000}
-!8 = !{!"MaxFunctionCount", i64 200000}
-!9 = !{!"NumCounts", i64 6}
-!10 = !{!"NumFunctions", i64 4}
-!11 = !{!"DetailedSummary", !12}
-!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
-!13 = !{i32 10000, i64 200000, i32 1}
-!14 = !{i32 100000, i64 200000, i32 1}
-!15 = !{i32 200000, i64 200000, i32 1}
-!16 = !{i32 300000, i64 200000, i32 1}
-!17 = !{i32 400000, i64 200000, i32 1}
-!18 = !{i32 500000, i64 100000, i32 4}
-!19 = !{i32 600000, i64 100000, i32 4}
-!20 = !{i32 700000, i64 100000, i32 4}
-!21 = !{i32 800000, i64 100000, i32 4}
-!22 = !{i32 900000, i64 100000, i32 4}
-!23 = !{i32 950000, i64 100000, i32 4}
-!24 = !{i32 990000, i64 100000, i32 4}
-!25 = !{i32 999000, i64 100000, i32 4}
-!26 = !{i32 999900, i64 100000, i32 4}
-!27 = !{i32 999990, i64 100000, i32 4}
-!28 = !{i32 999999, i64 1, i32 6}
-!30 = !{!"function_entry_count", i64 200000}
-!31 = !{!"branch_weights", i32 100000, i32 100000}
-!32 = !{!33, !33, i64 0}
-!33 = !{!"int", !34, i64 0}
-!34 = !{!"omnipotent char", !35, i64 0}
-!35 = !{!"Simple C/C++ TBAA"}
-!36 = !{!"cspgo_bar.c:cond"}
diff --git a/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll b/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
deleted file mode 100644
index 9d083fe04015e..0000000000000
--- a/llvm/test/Transforms/PGOProfile/counter_promo_sampling.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; RUN: opt < %s --passes=pgo-instr-gen,instrprof --do-counter-promotion=true --sampled-instrumentation=true --skip-ret-exit-block=0 -S | FileCheck --check-prefixes=SAMPLING,PROMO %s
-
-; SAMPLING: $__llvm_profile_sampling = comdat any
-; SAMPLING: @__llvm_profile_sampling = thread_local global i16 0, comdat
-
-define void @foo(i32 %n, i32 %N) {
-; SAMPLING-LABEL: @foo
-; SAMPLING:  %[[VV0:[0-9]+]] = load i16, ptr @__llvm_profile_sampling, align 2
-; SAMPLING:  %[[VV1:[0-9]+]] = icmp ule i16 %[[VV0]], 200
-; SAMPLING:  br i1 %[[VV1]], label {{.*}}, label {{.*}}, !prof !0
-; SAMPLING: {{.*}} = load {{.*}} @__profc_foo{{.*}} 3)
-; SAMPLING-NEXT: add
-; SAMPLING-NEXT: store {{.*}}@__profc_foo{{.*}}3)
-bb:
-  %tmp = add nsw i32 %n, 1
-  %tmp1 = add nsw i32 %n, -1
-  br label %bb2
-
-bb2:
-; PROMO: phi {{.*}}
-; PROMO-NEXT: phi {{.*}}
-; PROMO-NEXT: phi {{.*}}
-; PROMO-NEXT: phi {{.*}}
-  %i.0 = phi i32 [ 0, %bb ], [ %tmp10, %bb9 ]
-  %tmp3 = icmp slt i32 %i.0, %tmp
-  br i1 %tmp3, label %bb4, label %bb5
-
-bb4:
-  tail call void @bar(i32 1)
-  br label %bb9
-
-bb5:
-  %tmp6 = icmp slt i32 %i.0, %tmp1
-  br i1 %tmp6, label %bb7, label %bb8
-
-bb7:
-  tail call void @bar(i32 2)
-  br label %bb9
-
-bb8:
-  tail call void @bar(i32 3)
-  br label %bb9
-
-bb9:
-; SAMPLING:       phi {{.*}}
-; SAMPLING-NEXT:  %[[V1:[0-9]+]] = add i16 {{.*}}, 1
-; SAMPLING-NEXT:  store i16 %[[V1]], ptr @__llvm_profile_sampling, align 2
-; SAMPLING:       phi {{.*}}
-; SAMPLING-NEXT:  %[[V2:[0-9]+]] = add i16 {{.*}}, 1
-; SAMPLING-NEXT:  store i16 %[[V2]], ptr @__llvm_profile_sampling, align 2
-; SAMPLING:       phi {{.*}}
-; SAMPLING-NEXT:  %[[V3:[0-9]+]] = add i16 {{.*}}, 1
-; SAMPLING-NEXT:  store i16 %[[V3]], ptr @__llvm_profile_sampling, align 2
-; PROMO: %[[LIVEOUT3:[a-z0-9]+]] = phi {{.*}}
-; PROMO-NEXT: %[[LIVEOUT2:[a-z0-9]+]] = phi {{.*}}
-; PROMO-NEXT: %[[LIVEOUT1:[a-z0-9]+]] = phi {{.*}}
-  %tmp10 = add nsw i32 %i.0, 1
-  %tmp11 = icmp slt i32 %tmp10, %N
-  br i1 %tmp11, label %bb2, label %bb12
-
-bb12:
-  ret void
-; PROMO: %[[CHECK1:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}}
-; PROMO-NEXT: add {{.*}} %[[CHECK1]], %[[LIVEOUT1]]
-; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}
-; PROMO-NEXT: %[[CHECK2:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}} 1)
-; PROMO-NEXT: add {{.*}} %[[CHECK2]], %[[LIVEOUT2]]
-; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}1)
-; PROMO-NEXT: %[[CHECK3:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}} 2)
-; PROMO-NEXT: add {{.*}} %[[CHECK3]], %[[LIVEOUT3]]
-; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}2)
-; PROMO-NOT: @__profc_foo{{.*}})
-
-}
-
-declare void @bar(i32)
-
-; SAMPLING: !0 = !{!"branch_weights", i32 200, i32 65336}
diff --git a/llvm/test/Transforms/PGOProfile/cspgo_sample.ll b/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
deleted file mode 100644
index 97ad4d00c9d9c..0000000000000
--- a/llvm/test/Transforms/PGOProfile/cspgo_sample.ll
+++ /dev/null
@@ -1,112 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; REQUIRES: x86-registered-target
-
-; RUN: opt -module-summary %s -o %t1.bc
-; RUN: opt -module-summary %S/Inputs/cspgo_bar_sample.ll -o %t2.bc
-; RUN: llvm-lto2 run -lto-cspgo-profile-file=alloc -enable-sampled-instrumentation -lto-cspgo-gen -save-temps -o %t %t1.bc %t2.bc \
-; RUN:   -r=%t1.bc,foo,pl \
-; RUN:   -r=%t1.bc,bar,l \
-; RUN:   -r=%t1.bc,main,plx \
-; RUN:   -r=%t1.bc,__llvm_profile_filename,plx \
-; RUN:   -r=%t1.bc,__llvm_profile_raw_version,plx \
-; RUN:   -r=%t1.bc,__llvm_profile_sampling,pl \
-; RUN:   -r=%t2.bc,bar,pl \
-; RUN:   -r=%t2.bc,odd,pl \
-; RUN:   -r=%t2.bc,even,pl \
-; RUN:   -r=%t2.bc,__llvm_profile_filename,x \
-; RUN:   -r=%t2.bc,__llvm_profile_raw_version,x \
-; RUN:   -r=%t2.bc,__llvm_profile_sampling,
-; RUN: llvm-dis %t.1.4.opt.bc -o - | FileCheck %s --check-prefix=CSGEN
-
-; CSGEN: @__llvm_profile_sampling = thread_local global i16 0, comdat
-; CSGEN: @__profc_
-; CSGEN: @__profd_
-
-source_filename = "cspgo.c"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-$__llvm_profile_filename = comdat any
-$__llvm_profile_raw_version = comdat any
-$__llvm_profile_sampling = comdat any
-@__llvm_profile_filename = local_unnamed_addr constant [25 x i8] c"pass2/default_%m.profraw\00", comdat
-@__llvm_profile_raw_version = local_unnamed_addr constant i64 216172782113783812, comdat
-@__llvm_profile_sampling = thread_local global i16 0, comdat
-@llvm.used = appending global [1 x i8*] [i8* bitcast (i64* @__llvm_profile_sampling to i8*)], section "llvm.metadata"
-
-define dso_local void @foo() #0 !prof !30 {
-entry:
-  br label %for.body
-
-for.body:
-  %i.06 = phi i32 [ 0, %entry ], [ %add1, %for.body ]
-  tail call void @bar(i32 %i.06) #3
-  %add = or i32 %i.06, 1
-  tail call void @bar(i32 %add) #3
-  %add1 = add nuw nsw i32 %i.06, 2
-  %cmp = icmp ult i32 %add1, 200000
-  br i1 %cmp, label %for.body, label %for.end, !prof !31
-
-for.end:
-  ret void
-}
-
-; CSGEN-LABEL: @foo
-; CSGEN:        [[TMP0:%.*]]  = load i16, ptr @__llvm_profile_sampling, align 2
-; CSGEN-NEXT:   [[TMP1:%.*]] = icmp ult i16 [[TMP0]], 201
-; CSGEN-NEXT:   br i1 [[TMP1]], label %{{.*}}, label %{{.*}}, !prof [[PROF:![0-9]+]]
-; CSGEN:        [[TMP2:%.*]] = add i16 {{.*}}, 1
-; CSGEN-NEXT:   store i16 [[TMP2]], ptr @__llvm_profile_sampling, align 2
-
-declare dso_local void @bar(i32)
-
-define dso_local i32 @main() !prof !30 {
-entry:
-  tail call void @foo()
-  ret i32 0
-}
-; CSGEN-LABEL: @main
-; CSGEN:        [[TMP0:%.*]]  = load i16, ptr @__llvm_profile_sampling, align 2
-; CSGEN-NEXT:   [[TMP1:%.*]] = icmp ult i16 [[TMP0]], 201
-; CSGEN-NEXT:   br i1 [[TMP1]], label %{{.*}}, label %{{.*}}, !prof [[PROF:![0-9]+]]
-; CSGEN:        [[TMP2:%.*]] = add i16 {{.*}}, 1
-; CSGEN-NEXT:   store i16 [[TMP2]], ptr @__llvm_profile_sampling, align 2
-
-attributes #0 = { "target-cpu"="x86-64" }
-
-!llvm.module.flags = !{!0, !1, !2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 1, !"EnableSplitLTOUnit", i32 0}
-!2 = !{i32 1, !"ProfileSummary", !3}
-!3 = !{!4, !5, !6, !7, !8, !9, !10, !11}
-!4 = !{!"ProfileFormat", !"InstrProf"}
-!5 = !{!"TotalCount", i64 500002}
-!6 = !{!"MaxCount", i64 200000}
-!7 = !{!"MaxInternalCount", i64 100000}
-!8 = !{!"MaxFunctionCount", i64 200000}
-!9 = !{!"NumCounts", i64 6}
-!10 = !{!"NumFunctions", i64 4}
-!11 = !{!"DetailedSummary", !12}
-!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
-!13 = !{i32 10000, i64 200000, i32 1}
-!14 = !{i32 100000, i64 200000, i32 1}
-!15 = !{i32 200000, i64 200000, i32 1}
-!16 = !{i32 300000, i64 200000, i32 1}
-!17 = !{i32 400000, i64 200000, i32 1}
-!18 = !{i32 500000, i64 100000, i32 4}
-!19 = !{i32 600000, i64 100000, i32 4}
-!20 = !{i32 700000, i64 100000, i32 4}
-!21 = !{i32 800000, i64 100000, i32 4}
-!22 = !{i32 900000, i64 100000, i32 4}
-!23 = !{i32 950000, i64 100000, i32 4}
-!24 = !{i32 990000, i64 100000, i32 4}
-!25 = !{i32 999000, i64 100000, i32 4}
-!26 = !{i32 999900, i64 100000, i32 4}
-!27 = !{i32 999990, i64 100000, i32 4}
-!28 = !{i32 999999, i64 1, i32 6}
-!30 = !{!"function_entry_count", i64 1}
-!31 = !{!"branch_weights", i32 100000, i32 1}
-
-; CSGEN: [[PROF]] = !{!"branch_weights", i32 200, i32 65336}
-
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
deleted file mode 100644
index dcc1e805ba6f6..0000000000000
--- a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_fast.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: opt < %s --passes=instrprof --sampled-instrumentation -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION,SAMPLE-WEIGHT
-; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=100 -S | FileCheck %s --check-prefixes=SAMPLE-VAR,SAMPLE-CODE,SAMPLE-DURATION100,SAMPLE-WEIGHT100
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-$__llvm_profile_raw_version = comdat any
-
-; SAMPLE-VAR: $__llvm_profile_sampling = comdat any
-
-@__llvm_profile_raw_version = constant i64 72057594037927940, comdat
-@__profn_f = private constant [1 x i8] c"f"
-
-; SAMPLE-VAR: @__llvm_profile_sampling = thread_local global i16 0, comdat
-; SAMPLE-VAR: @__profc_f = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
-; SAMPLE-VAR: @__profd_f = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 -3706093650706652785, i64 12884901887, i64 sub (i64 ptrtoint (ptr @__profc_f to i64), i64 ptrtoint (ptr @__profd_f to i64)), i64 0, ptr @f.local, ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc_f), align 8
-; SAMPLE-VAR: @__llvm_prf_nm = private constant {{.*}}, section "__llvm_prf_names", align 1
-; SAMPLE-VAR: @llvm.compiler.used = appending global [2 x ptr] [ptr @__llvm_profile_sampling, ptr @__profd_f], section "llvm.metadata"
-; SAMPLE-VAR: @llvm.used = appending global [1 x ptr] [ptr @__llvm_prf_nm], section "llvm.metadata"
-
-
-define void @f() {
-; SAMPLE-CODE-LABEL: @f(
-; SAMPLE-CODE:  entry:
-; SAMPLE-CODE-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
-; SAMPLE-DURATION:         [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 200
-; SAMPLE-DURATION100:     [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 100
-; SAMPLE-CODE:         br i1 [[TMP1]], label %[[TMP2:.*]], label %[[TMP4:.*]], !prof !0
-; SAMPLE-CODE:       [[TMP2]]:
-; SAMPLE-CODE-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f
-; SAMPLE-CODE-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
-; SAMPLE-CODE-NEXT:    store i64 [[TMP3]], ptr @__profc_f
-; SAMPLE-CODE-NEXT:    br label %[[TMP4]]
-; SAMPLE-CODE:       [[TMP4]]:
-; SAMPLE-CODE-NEXT:    [[TMP5:%.*]] = add i16 [[TMP0]], 1
-; SAMPLE-CODE-NEXT:    store i16 [[TMP5]],  ptr @__llvm_profile_sampling, align 2
-; SAMPLE-CODE-NEXT:    ret void
-;
-entry:
-  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
-  ret void
-}
-
-; SAMPLE-WEIGHT: !0 = !{!"branch_weights", i32 200, i32 65336}
-; SAMPLE-WEIGHT100: !0 = !{!"branch_weights", i32 100, i32 65436}
-
-declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
deleted file mode 100644
index 57d1a0cd33fbe..0000000000000
--- a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --passes=instrprof -sampled-instrumentation --sampled-instr-period=1009 --sampled-instr-burst-duration=32 -S | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-$__llvm_profile_raw_version = comdat any
-
-@__llvm_profile_raw_version = constant i64 72057594037927940, comdat
-@__profn_f = private constant [1 x i8] c"f"
-
-define void @f() {
-; CHECK-LABEL: define void @f() {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i16 [[TMP0]], 32
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB4:.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       [[BB2]]:
-; CHECK-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
-; CHECK-NEXT:    store i64 [[TMP3]], ptr @__profc_f, align 8
-; CHECK-NEXT:    br label %[[BB4]]
-; CHECK:       [[BB4]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp uge i16 [[TMP5]], 1009
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1:![0-9]+]]
-; CHECK:       [[BB7]]:
-; CHECK-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
-; CHECK-NEXT:    br label %[[BB9:.*]]
-; CHECK:       [[BB8]]:
-; CHECK-NEXT:    store i16 [[TMP5]], ptr @__llvm_profile_sampling, align 2
-; CHECK-NEXT:    br label %[[BB9]]
-; CHECK:       [[BB9]]:
-; CHECK-NEXT:    ret void
-;
-entry:
-  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
-  ret void
-}
-
-declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
-;.
-; CHECK: [[PROF0]] = !{!"branch_weights", i32 32, i32 978}
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1009}
-;.
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll b/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
deleted file mode 100644
index 1ad889524bc6a..0000000000000
--- a/llvm/test/Transforms/PGOProfile/instrprof_burst_sampling_full_intsize.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-period=1000019 --sampled-instr-burst-duration=3000 -S | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-$__llvm_profile_raw_version = comdat any
-
-@__llvm_profile_raw_version = constant i64 72057594037927940, comdat
-@__profn_f = private constant [1 x i8] c"f"
-
-define void @f() {
-; CHECK-LABEL: define void @f() {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @__llvm_profile_sampling, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i32 [[TMP0]], 3000
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB4:.*]], !prof [[PROF0:![0-9]+]]
-; CHECK:       [[BB2]]:
-; CHECK-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[PGOCOUNT]], 1
-; CHECK-NEXT:    store i64 [[TMP3]], ptr @__profc_f, align 8
-; CHECK-NEXT:    br label %[[BB4]]
-; CHECK:       [[BB4]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp uge i32 [[TMP5]], 1000019
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1:![0-9]+]]
-; CHECK:       [[BB7]]:
-; CHECK-NEXT:    store i32 0, ptr @__llvm_profile_sampling, align 4
-; CHECK-NEXT:    br label %[[BB9:.*]]
-; CHECK:       [[BB8]]:
-; CHECK-NEXT:    store i32 [[TMP5]], ptr @__llvm_profile_sampling, align 4
-; CHECK-NEXT:    br label %[[BB9]]
-; CHECK:       [[BB9]]:
-; CHECK-NEXT:    ret void
-;
-entry:
-  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
-  ret void
-}
-
-declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
-;.
-; CHECK: [[PROF0]] = !{!"branch_weights", i32 3000, i32 997020}
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1000019}
-;.
diff --git a/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll b/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
deleted file mode 100644
index 8e846bbf1d982..0000000000000
--- a/llvm/test/Transforms/PGOProfile/instrprof_simple_sampling.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=1 --sampled-instr-period=1009 -S | FileCheck %s --check-prefix=PERIOD1009
-; RUN: opt < %s --passes=instrprof --sampled-instrumentation --sampled-instr-burst-duration=1 -S | FileCheck %s --check-prefix=DEFAULTPERIOD
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-$__llvm_profile_raw_version = comdat any
-
-@__llvm_profile_raw_version = constant i64 72057594037927940, comdat
-@__profn_f = private constant [1 x i8] c"f"
-
-define void @f() {
-; PERIOD1009-LABEL: define void @f() {
-; PERIOD1009-NEXT:  [[ENTRY:.*:]]
-; PERIOD1009-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
-; PERIOD1009-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
-; PERIOD1009-NEXT:    [[TMP2:%.*]] = icmp uge i16 [[TMP1]], 1009
-; PERIOD1009-NEXT:    br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB5:.*]], !prof [[PROF0:![0-9]+]]
-; PERIOD1009:       [[BB3]]:
-; PERIOD1009-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
-; PERIOD1009-NEXT:    [[TMP4:%.*]] = add i64 [[PGOCOUNT]], 1
-; PERIOD1009-NEXT:    store i64 [[TMP4]], ptr @__profc_f, align 8
-; PERIOD1009-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
-; PERIOD1009-NEXT:    br label %[[BB6:.*]]
-; PERIOD1009:       [[BB5]]:
-; PERIOD1009-NEXT:    store i16 [[TMP1]], ptr @__llvm_profile_sampling, align 2
-; PERIOD1009-NEXT:    br label %[[BB6]]
-; PERIOD1009:       [[BB6]]:
-; PERIOD1009-NEXT:    ret void
-;
-; DEFAULTPERIOD-LABEL: define void @f() {
-; DEFAULTPERIOD-NEXT:  [[ENTRY:.*:]]
-; DEFAULTPERIOD-NEXT:    [[TMP0:%.*]] = load i16, ptr @__llvm_profile_sampling, align 2
-; DEFAULTPERIOD-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
-; DEFAULTPERIOD-NEXT:    [[TMP2:%.*]] = icmp uge i16 [[TMP1]], -1
-; DEFAULTPERIOD-NEXT:    br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB5:.*]], !prof [[PROF0:![0-9]+]]
-; DEFAULTPERIOD:       [[BB3]]:
-; DEFAULTPERIOD-NEXT:    [[PGOCOUNT:%.*]] = load i64, ptr @__profc_f, align 8
-; DEFAULTPERIOD-NEXT:    [[TMP4:%.*]] = add i64 [[PGOCOUNT]], 1
-; DEFAULTPERIOD-NEXT:    store i64 [[TMP4]], ptr @__profc_f, align 8
-; DEFAULTPERIOD-NEXT:    store i16 0, ptr @__llvm_profile_sampling, align 2
-; DEFAULTPERIOD-NEXT:    br label %[[BB6:.*]]
-; DEFAULTPERIOD:       [[BB5]]:
-; DEFAULTPERIOD-NEXT:    store i16 [[TMP1]], ptr @__llvm_profile_sampling, align 2
-; DEFAULTPERIOD-NEXT:    br label %[[BB6]]
-; DEFAULTPERIOD:       [[BB6]]:
-; DEFAULTPERIOD-NEXT:    ret void
-;
-entry:
-  call void @llvm.instrprof.increment(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__profn_f, i32 0, i32 0), i64 12884901887, i32 1, i32 0)
-  ret void
-}
-
-declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
-;.
-; PERIOD1009: [[PROF0]] = !{!"branch_weights", i32 1, i32 1009}
-;.
-; DEFAULTPERIOD: [[PROF0]] = !{!"branch_weights", i32 1, i32 65535}
-;.
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
deleted file mode 100644
index 81837bdf99eaf..0000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux-gnu -mcpu=z16 -slp-threshold=-10 < %s | FileCheck %s
-
-define i1 @test(i64 %0, i64 %1, ptr %2) {
-; CHECK-LABEL: define i1 @test(
-; CHECK-SAME: i64 [[TMP0:%.*]], i64 [[TMP1:%.*]], ptr [[TMP2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[GEP44:%.*]] = getelementptr i8, ptr null, i64 [[TMP0]]
-; CHECK-NEXT:    [[GEP45:%.*]] = getelementptr i8, ptr null, i64 [[TMP1]]
-; CHECK-NEXT:    [[GEP48:%.*]] = getelementptr i8, ptr null, i64 [[TMP0]]
-; CHECK-NEXT:    [[GEP49:%.*]] = getelementptr i8, ptr null, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x ptr> poison, ptr [[GEP44]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x ptr> [[TMP3]], ptr [[GEP48]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[GEP45]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[GEP49]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult <2 x ptr> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x ptr> [[TMP4]], <2 x ptr> [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x ptr> [[TMP9]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <2 x ptr> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
-; CHECK-NEXT:    [[RES:%.*]] = and i1 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    ret i1 [[RES]]
-;
-entry:
-  %gep44 = getelementptr i8, ptr null, i64 %0
-  %gep45 = getelementptr i8, ptr null, i64 %1
-  %4 = icmp ult ptr %gep44, %gep45
-  %umin = select i1 %4, ptr %gep44, ptr %gep45
-  %gep48 = getelementptr i8, ptr null, i64 %0
-  %gep49 = getelementptr i8, ptr null, i64 %1
-  %5 = icmp ult ptr %gep48, %gep49
-  %umin50 = select i1 %5, ptr %gep48, ptr %gep49
-  %b095 = icmp ult ptr %umin, %2
-  %b196 = icmp ult ptr %umin50, %2
-  %res = and i1 %b095, %b196
-  ret i1 %res
-}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
index be790b772a2eb..58c108b81a7ce 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr="+avx512f,+avx512bw" -slp-threshold=-100 -slp-min-tree-size=0 < %s | FileCheck %s
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr="-avx512pf,+avx512f,+avx512bw" -slp-threshold=-100 -slp-min-tree-size=0 < %s | FileCheck %s
 
 define i32 @foo(i32 %a) {
 ; CHECK-LABEL: @foo(
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/two-entry-phi-fold-unpredictable.ll b/llvm/test/Transforms/SimplifyCFG/X86/two-entry-phi-fold-unpredictable.ll
deleted file mode 100644
index 82566d47b0328..0000000000000
--- a/llvm/test/Transforms/SimplifyCFG/X86/two-entry-phi-fold-unpredictable.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; Two-entry phi nodes with unpredictable conditions may get increased budget for folding.
-; RUN: opt < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s
-; RUN: opt < %s -S -passes='simplifycfg<speculate-unpredictables>' | FileCheck --check-prefix=CHECK-NOFOLD %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes='simplifycfg<speculate-unpredictables>' | FileCheck --check-prefix=CHECK-FOLD %s
-
-define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
-; CHECK-NOFOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
-; CHECK-NOFOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NOFOLD-NEXT:  [[BB:.*]]:
-; CHECK-NOFOLD-NEXT:    [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000
-; CHECK-NOFOLD-NEXT:    br i1 [[I]], label %[[BB3:.*]], label %[[BB20:.*]], !unpredictable [[META0:![0-9]+]]
-; CHECK-NOFOLD:       [[BB3]]:
-; CHECK-NOFOLD-NEXT:    [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0
-; CHECK-NOFOLD-NEXT:    [[I5:%.*]] = fmul fast float [[I4]], [[I4]]
-; CHECK-NOFOLD-NEXT:    [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1
-; CHECK-NOFOLD-NEXT:    [[I7:%.*]] = fmul fast float [[I6]], [[I6]]
-; CHECK-NOFOLD-NEXT:    [[I8:%.*]] = fadd fast float [[I7]], [[I5]]
-; CHECK-NOFOLD-NEXT:    [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0
-; CHECK-NOFOLD-NEXT:    [[I10:%.*]] = fmul fast float [[I9]], [[I9]]
-; CHECK-NOFOLD-NEXT:    [[I11:%.*]] = fadd fast float [[I8]], [[I10]]
-; CHECK-NOFOLD-NEXT:    [[I12:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[I11]])
-; CHECK-NOFOLD-NEXT:    [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]]
-; CHECK-NOFOLD-NEXT:    [[I14:%.*]] = fmul fast float [[I13]], [[I4]]
-; CHECK-NOFOLD-NEXT:    [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0
-; CHECK-NOFOLD-NEXT:    [[I16:%.*]] = fmul fast float [[I13]], [[I6]]
-; CHECK-NOFOLD-NEXT:    [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1
-; CHECK-NOFOLD-NEXT:    [[I18:%.*]] = fmul fast float [[I13]], [[I9]]
-; CHECK-NOFOLD-NEXT:    [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0
-; CHECK-NOFOLD-NEXT:    br label %[[BB20]]
-; CHECK-NOFOLD:       [[BB20]]:
-; CHECK-NOFOLD-NEXT:    [[I21:%.*]] = phi nsz <2 x float> [ [[I17]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ]
-; CHECK-NOFOLD-NEXT:    [[I22:%.*]] = phi nsz <2 x float> [ [[I19]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ]
-; CHECK-NOFOLD-NEXT:    [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0
-; CHECK-NOFOLD-NEXT:    [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1
-; CHECK-NOFOLD-NEXT:    ret { <2 x float>, <2 x float> } [[I24]]
-;
-; CHECK-FOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
-; CHECK-FOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-FOLD-NEXT:  [[BB:.*:]]
-; CHECK-FOLD-NEXT:    [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000
-; CHECK-FOLD-NEXT:    [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0
-; CHECK-FOLD-NEXT:    [[I5:%.*]] = fmul fast float [[I4]], [[I4]]
-; CHECK-FOLD-NEXT:    [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1
-; CHECK-FOLD-NEXT:    [[I7:%.*]] = fmul fast float [[I6]], [[I6]]
-; CHECK-FOLD-NEXT:    [[I8:%.*]] = fadd fast float [[I7]], [[I5]]
-; CHECK-FOLD-NEXT:    [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0
-; CHECK-FOLD-NEXT:    [[I10:%.*]] = fmul fast float [[I9]], [[I9]]
-; CHECK-FOLD-NEXT:    [[I11:%.*]] = fadd fast float [[I8]], [[I10]]
-; CHECK-FOLD-NEXT:    [[I12:%.*]] = tail call fast float @llvm.sqrt.f32(float [[I11]])
-; CHECK-FOLD-NEXT:    [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]]
-; CHECK-FOLD-NEXT:    [[I14:%.*]] = fmul fast float [[I13]], [[I4]]
-; CHECK-FOLD-NEXT:    [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0
-; CHECK-FOLD-NEXT:    [[I16:%.*]] = fmul fast float [[I13]], [[I6]]
-; CHECK-FOLD-NEXT:    [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1
-; CHECK-FOLD-NEXT:    [[I18:%.*]] = fmul fast float [[I13]], [[I9]]
-; CHECK-FOLD-NEXT:    [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0
-; CHECK-FOLD-NEXT:    [[I21:%.*]] = select nsz i1 [[I]], <2 x float> [[I17]], <2 x float> zeroinitializer, !unpredictable [[META0:![0-9]+]]
-; CHECK-FOLD-NEXT:    [[I22:%.*]] = select nsz i1 [[I]], <2 x float> [[I19]], <2 x float> zeroinitializer, !unpredictable [[META0]]
-; CHECK-FOLD-NEXT:    [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0
-; CHECK-FOLD-NEXT:    [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1
-; CHECK-FOLD-NEXT:    ret { <2 x float>, <2 x float> } [[I24]]
-;
-bb:
-  %i = fcmp fast ogt float %arg, 0x3F747AE140000000
-  br i1 %i, label %bb3, label %bb20, !unpredictable !0
-
-bb3:                                              ; preds = %bb
-  %i4 = extractelement <2 x float> %arg1, i64 0
-  %i5 = fmul fast float %i4, %i4
-  %i6 = extractelement <2 x float> %arg1, i64 1
-  %i7 = fmul fast float %i6, %i6
-  %i8 = fadd fast float %i7, %i5
-  %i9 = extractelement <2 x float> %arg2, i64 0
-  %i10 = fmul fast float %i9, %i9
-  %i11 = fadd fast float %i8, %i10
-  %i12 = tail call fast noundef float @llvm.sqrt.f32(float %i11)
-  %i13 = fdiv fast float 0x3FEFD70A40000000, %i12
-  %i14 = fmul fast float %i13, %i4
-  %i15 = insertelement <2 x float> poison, float %i14, i64 0
-  %i16 = fmul fast float %i13, %i6
-  %i17 = insertelement <2 x float> %i15, float %i16, i64 1
-  %i18 = fmul fast float %i13, %i9
-  %i19 = insertelement <2 x float> %arg2, float %i18, i64 0
-  br label %bb20
-
-bb20:                                             ; preds = %bb3, %bb
-  %i21 = phi nsz <2 x float> [ %i17, %bb3 ], [ zeroinitializer, %bb ]
-  %i22 = phi nsz <2 x float> [ %i19, %bb3 ], [ zeroinitializer, %bb ]
-  %i23 = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %i21, 0
-  %i24 = insertvalue { <2 x float>, <2 x float> } %i23, <2 x float> %i22, 1
-  ret { <2 x float>, <2 x float> } %i24
-}
-
-declare float @llvm.sqrt.f32(float)
-
-attributes #0 = { nounwind }
-
-!0 = !{}
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
index 97608174b524d..03fbb5e5a4674 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
@@ -1130,23 +1130,14 @@ define <vscale x 1 x i64> @umax_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
 }
 
 define <vscale x 1 x float> @fadd_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @fadd_nxv1f32_allonesmask(
-; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = fadd float [[Y:%.*]], 4.200000e+01
-; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
-; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @fadd_nxv1f32_allonesmask(
-; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP4]]
+; ALL-LABEL: @fadd_nxv1f32_allonesmask(
+; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT:    ret <vscale x 1 x float> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1173,23 +1164,14 @@ define <vscale x 1 x float> @fadd_nxv1f32_anymask(<vscale x 1 x float> %x, float
 }
 
 define <vscale x 1 x float> @fsub_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @fsub_nxv1f32_allonesmask(
-; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = fsub float [[Y:%.*]], 4.200000e+01
-; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
-; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @fsub_nxv1f32_allonesmask(
-; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP4]]
+; ALL-LABEL: @fsub_nxv1f32_allonesmask(
+; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT:    ret <vscale x 1 x float> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1216,23 +1198,14 @@ define <vscale x 1 x float> @fsub_nxv1f32_anymask(<vscale x 1 x float> %x, float
 }
 
 define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask(<vscale x 1 x float> %x, float %y, i32 zeroext %evl) {
-; VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask(
-; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01
-; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
-; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask(
-; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
-; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
-; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP4]]
+; ALL-LABEL: @fdiv_nxv1f32_allonesmask(
+; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; ALL-NEXT:    ret <vscale x 1 x float> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
@@ -1302,23 +1275,14 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
 }
 
 define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask_knownvl(<vscale x 1 x float> %x, float %y) {
-; VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
-; VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP1:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01
-; VEC-COMBINE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x float> poison, float [[TMP1]], i64 0
-; VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[DOTSPLATINSERT]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP2]], <vscale x 1 x i1> [[MASK]], i32 4)
-; VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP3]]
-;
-; NO-VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
-; NO-VEC-COMBINE-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
-; NO-VEC-COMBINE-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
-; NO-VEC-COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
-; NO-VEC-COMBINE-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
-; NO-VEC-COMBINE-NEXT:    ret <vscale x 1 x float> [[TMP4]]
+; ALL-LABEL: @fdiv_nxv1f32_allonesmask_knownvl(
+; ALL-NEXT:    [[SPLAT:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; ALL-NEXT:    [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
+; ALL-NEXT:    [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
+; ALL-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
+; ALL-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
+; ALL-NEXT:    ret <vscale x 1 x float> [[TMP4]]
 ;
   %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
   %mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
diff --git a/llvm/test/tools/llvm-dlltool/arm64ec.test b/llvm/test/tools/llvm-dlltool/arm64ec.test
index 4ed0f4a4387d6..b03b4eaf7b2d0 100644
--- a/llvm/test/tools/llvm-dlltool/arm64ec.test
+++ b/llvm/test/tools/llvm-dlltool/arm64ec.test
@@ -44,29 +44,6 @@ RUN: llvm-nm --print-armap test3.lib | FileCheck --check-prefix=ARMAP2 %s
 RUN: not llvm-dlltool -m arm64 -d test.def -N test2.def -l test4.lib 2>&1 | FileCheck --check-prefix=ERR %s
 ERR: native .def file is supported only on arm64ec target
 
-RUN: llvm-dlltool -m arm64ec -d test3.def -l test3.lib
-RUN: llvm-readobj test3.lib | FileCheck --check-prefix=ALIAS %s
-
-ALIAS:      File: test.dll
-ALIAS-NEXT: Format: COFF-import-file-ARM64EC
-ALIAS-NEXT: Type: code
-ALIAS-NEXT: Name type: export as
-ALIAS-NEXT: Export name: efunc
-ALIAS-NEXT: Symbol: __imp_func
-ALIAS-NEXT: Symbol: func
-ALIAS-NEXT: Symbol: __imp_aux_func
-ALIAS-NEXT: Symbol: #func
-ALIAS-EMPTY:
-ALIAS-NEXT: File: test.dll
-ALIAS-NEXT: Format: COFF-import-file-ARM64EC
-ALIAS-NEXT: Type: code
-ALIAS-NEXT: Name type: export as
-ALIAS-NEXT: Export name: efunc
-ALIAS-NEXT: Symbol: __imp_efunc
-ALIAS-NEXT: Symbol: efunc
-ALIAS-NEXT: Symbol: __imp_aux_efunc
-ALIAS-NEXT: Symbol: #efunc
-
 #--- test.def
 LIBRARY test.dll
 EXPORTS
@@ -76,9 +53,3 @@ EXPORTS
 LIBRARY test.dll
 EXPORTS
     otherfunc
-
-#--- test3.def
-LIBRARY test.dll
-EXPORTS
-    func == efunc
-    efunc
diff --git a/llvm/test/tools/llvm-objdump/ELF/AArch64/elf-aarch64-mapping-symbols.test b/llvm/test/tools/llvm-objdump/ELF/AArch64/elf-aarch64-mapping-symbols.test
index 011e67cda8c32..cf24952a2d8d4 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AArch64/elf-aarch64-mapping-symbols.test
+++ b/llvm/test/tools/llvm-objdump/ELF/AArch64/elf-aarch64-mapping-symbols.test
@@ -4,11 +4,11 @@
 
 # CHECK:       Disassembly of section .mysection:
 # CHECK-EMPTY:
-# ALL-NEXT:    <$x>:
+# ALL-NEXT:    <$x.0>:
 # CHECK-NEXT:  <_start>:
 # CHECK-NEXT:         0:       10000021        adr     x1, 0x4
 # CHECK-EMPTY:
-# ALL-NEXT:    <$d>:
+# ALL-NEXT:    <$d.1>:
 # CHECK-NEXT:  <msg>:
 # CHECK-NEXT:         4:       48 65 6c 6c     .word
 # CHECK-NEXT:         8:       6f 2c 20 77     .word
@@ -18,10 +18,10 @@
 # CHECK-NEXT:  Disassembly of section .myothersection:
 # CHECK-EMPTY:
 # NOALL-NEXT:  <.myothersection>:
-# ALL-NEXT:    <$x>:
+# ALL-NEXT:    <$x.2>:
 # CHECK-NEXT:         0:       90000001        adrp    x1, 0x0
 # CHECK-EMPTY:
-# ALL-NEXT:    <$d>:
+# ALL-NEXT:    <$d.3>:
 # CHECK-NEXT:  <mystr>:
 # CHECK-NEXT:         4:       62 6c 61 68     .word
 # CHECK-NEXT:         8:       9a              .byte   0x9a
diff --git a/llvm/test/tools/llvm-profgen/period-scaling.test b/llvm/test/tools/llvm-profgen/period-scaling.test
deleted file mode 100644
index a44c9a78caeaf..0000000000000
--- a/llvm/test/tools/llvm-profgen/period-scaling.test
+++ /dev/null
@@ -1,84 +0,0 @@
-// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_inst_retired.near_taken:upp --sample-period=1000003
-// RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE
-// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_inst_retired.near_taken:upp --sample-period=1000003
-// RUN: FileCheck %s --input-file %t --check-prefix=CHECK
-
-// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_misp_retired.all_branches:upp --leading-ip-only --sample-period=1000003
-// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED-RAW-PROFILE
-// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_misp_retired.all_branches:upp --leading-ip-only --sample-period=1000003
-// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED
-
-// Check that we can use perf event filtering to generate multiple types of
-// source-level profiles from a single perf profile. In this case, we generate
-// a typical execution frequency profile using br_inst_retired.near_taken LBRs,
-// and a branch mispredict profile using br_misp_retired.all_branches sample
-// IPs.
-
-// Check that we can use --sample-period to compute LBR and IP-based profiles
-// which have comparable, absolute magnitudes. For example, in this case the
-// branch of interest (at source line offset 4) is in a loop body which is
-// executed ~20M times in total, and it's mispredicted about 9M times, yielding
-// a mispredict rate of roughly 0.45.
-
-// The source example below is based on perfKernelCpp/cmov_3, except a
-// misleading builtin is used to persuade the compiler not to use cmov, which
-// induces branch mispredicts.
-
-// CHECK: sel_arr:652547082:0
-// CHECK:  3.1: 20225766
-// CHECK:  3.2: 20225766
-// CHECK:  4: 19838670
-// CHECK:  5: 20225766
-
-// UNPRED: sel_arr:18000054:0
-// UNPRED:  3.1: 0
-// UNPRED:  3.2: 0
-// UNPRED:  4: 9000027
-// UNPRED:  5: 0
-
-// CHECK-RAW-PROFILE:      3
-// CHECK-RAW-PROFILE-NEXT: 2f0-2fa:9774174
-// CHECK-RAW-PROFILE-NEXT: 2f0-310:10064496
-// CHECK-RAW-PROFILE-NEXT: 2ff-310:10161270
-
-// UNPRED-RAW-PROFILE:      1
-// UNPRED-RAW-PROFILE-NEXT: 2fa-2fa:9000027
-
-// original code:
-// icx -fprofile-sample-generate lit.c
-#include <stdlib.h>
-
-#define N 20000
-#define ITERS 10000
-
-static int *m_s1, *m_s2, *m_s3, *m_dst;
-
-void init(void) {
-    m_s1 = malloc(sizeof(int)*N);
-    m_s2 = malloc(sizeof(int)*N);
-    m_s3 = malloc(sizeof(int)*N);
-    m_dst = malloc(sizeof(int)*N);
-    srand(42);
-
-    for (int i = 0; i < N; i++) {
-        m_s1[i] = rand() % N;
-        m_s2[i] = 0;
-        m_s3[i] = 1;
-    }
-}
-
-void __attribute__((noinline)) sel_arr(int *dst, int *s1, int *s2, int *s3) {
-#pragma nounroll
-#pragma clang loop vectorize(disable) interleave(disable)
-    for (int i = 0; i < N; i++) {
-        int *p = __builtin_expect((s1[i] < 10035), 0) ? &s2[i] : &s3[i];
-        dst[i] = *p;
-    }
-}
-
-int main(void) {
-  init();
-  for(int i=0; i<ITERS; ++i)
-    sel_arr(m_dst, m_s1, m_s2, m_s3);
-  return 0;
-}
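
The scaling that the deleted test and the comments above describe can be summarized in a small standalone sketch. This is a hedged illustration only; `scaleSampleCount` and its parameters are hypothetical names, not llvm-profgen's interface:

```cpp
#include <cstddef>
#include <cstdint>

// Hedged sketch of the sample-count adjustment described in the removed test
// comments. All names here are illustrative, not llvm-profgen's actual API.
uint64_t scaleSampleCount(uint64_t count, uint64_t samplePeriod,
                          size_t lbrStackSize, bool leadingIPOnly) {
  // Each perf sample stands in for roughly `samplePeriod` retired events.
  count *= samplePeriod;
  // An LBR sample contributes one count per branch edge in its stack, so
  // LBR-derived counts are amplified by roughly the stack depth; dividing
  // that back out makes LBR- and IP-based profiles comparable.
  if (!leadingIPOnly && lbrStackSize > 1)
    count /= (lbrStackSize - 1);
  return count;
}
// For example, 9 leading-IP mispredict samples at period 1000003 scale to
// 9 * 1000003 == 9000027, matching the UNPRED-RAW-PROFILE check above, and
// 9000027 / 20225766 is roughly the 0.45 mispredict rate mentioned above.
```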
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index 4041271cc0a82..b4e4911fb8912 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -53,10 +53,6 @@ static cl::alias
                           cl::desc("Comma-delimited version of -perf-event"),
                           cl::aliasopt(PerfEventFilter));
 
-static cl::opt<uint64_t>
-    SamplePeriod("sample-period", cl::init(1),
-                 cl::desc("The sampling period (-c) used for perf data"));
-
 extern cl::opt<std::string> PerfTraceFilename;
 extern cl::opt<bool> ShowDisassemblyOnly;
 extern cl::opt<bool> ShowSourceLocations;
@@ -1004,16 +1000,6 @@ void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) {
   if (extractLBRStack(TraceIt, Sample->LBRStack)) {
     warnIfMissingMMap();
     // Record LBR only samples by aggregation
-    // If a sampling period is given, we can adjust the magnitude of sample
-    // counts to estimate the absolute magnitude.
-    if (SamplePeriod.getNumOccurrences()) {
-      Count *= SamplePeriod;
-      // If counts are LBR-based, as opposed to IP-based, then the magnitude is
-      // now amplified by roughly the LBR stack size. By adjusting this down, we
-      // can produce LBR-based and IP-based profiles with comparable magnitudes.
-      if (!LeadingIPOnly && Sample->LBRStack.size() > 1)
-        Count /= (Sample->LBRStack.size() - 1);
-    }
     AggregatedSamples[Hashable<PerfSample>(Sample)] += Count;
   }
 }
diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
index cc868e7587dc6..af431658c9b7d 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -28,7 +28,6 @@ add_llvm_unittest(OrcJITTests
   LookupAndRecordAddrsTest.cpp
   MachOPlatformTest.cpp
   MapperJITLinkMemoryManagerTest.cpp
-  MemoryFlagsTest.cpp
   MemoryMapperTest.cpp
   ObjectFormatsTest.cpp
   ObjectLinkingLayerTest.cpp
diff --git a/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp
deleted file mode 100644
index 270ea2f442b9e..0000000000000
--- a/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-//===------- MemoryFlagsTest.cpp - Test MemoryFlags and related APIs ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h"
-#include "gtest/gtest.h"
-
-#include <future>
-
-using namespace llvm;
-using namespace llvm::orc;
-
-TEST(MemProtTest, Basics) {
-  MemProt MPNone = MemProt::None;
-
-  EXPECT_EQ(MPNone & MemProt::Read, MemProt::None);
-  EXPECT_EQ(MPNone & MemProt::Write, MemProt::None);
-  EXPECT_EQ(MPNone & MemProt::Exec, MemProt::None);
-
-  EXPECT_EQ(MPNone | MemProt::Read, MemProt::Read);
-  EXPECT_EQ(MPNone | MemProt::Write, MemProt::Write);
-  EXPECT_EQ(MPNone | MemProt::Exec, MemProt::Exec);
-
-  MemProt MPAll = MemProt::Read | MemProt::Write | MemProt::Exec;
-  EXPECT_EQ(MPAll & MemProt::Read, MemProt::Read);
-  EXPECT_EQ(MPAll & MemProt::Write, MemProt::Write);
-  EXPECT_EQ(MPAll & MemProt::Exec, MemProt::Exec);
-}
-
-TEST(AllocGroupSmallMap, EmptyMap) {
-  AllocGroupSmallMap<bool> EM;
-  EXPECT_TRUE(EM.empty());
-  EXPECT_EQ(EM.size(), 0u);
-}
-
-TEST(AllocGroupSmallMap, NonEmptyMap) {
-  AllocGroupSmallMap<unsigned> NEM;
-  NEM[MemProt::Read] = 42;
-
-  EXPECT_FALSE(NEM.empty());
-  EXPECT_EQ(NEM.size(), 1U);
-  EXPECT_EQ(NEM[MemProt::Read], 42U);
-  EXPECT_EQ(NEM.find(MemProt::Read), NEM.begin());
-  EXPECT_EQ(NEM.find(MemProt::Read | MemProt::Write), NEM.end());
-
-  NEM[MemProt::Read | MemProt::Write] = 7;
-  EXPECT_EQ(NEM.size(), 2U);
-  EXPECT_EQ(NEM.begin()->second, 42U);
-  EXPECT_EQ((NEM.begin() + 1)->second, 7U);
-}
-
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index ba90b4f811f8e..054a81e9cf308 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -132,7 +132,7 @@ define i32 @foo(i32 %v0, i32 %v1) {
   auto *Arg1 = F.getArg(1);
   auto It = BB.begin();
   auto *I0 = &*It++;
-  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+  auto *Ret = &*It++;
 
   SmallVector<sandboxir::Argument *> Args{Arg0, Arg1};
   unsigned OpIdx = 0;
@@ -245,7 +245,7 @@ define i32 @foo(i32 %arg0, i32 %arg1) {
   auto *I0 = &*It++;
   auto *I1 = &*It++;
   auto *I2 = &*It++;
-  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+  auto *Ret = &*It++;
 
   bool Replaced;
   // Try to replace an operand that doesn't match.
@@ -401,7 +401,7 @@ void @foo(i32 %arg0, i32 %arg1) {
   br label %bb1 ; SB3. (Opaque)
 
 bb1:
-  ret void ; SB5. (Ret)
+  ret void ; SB5. (Opaque)
 }
 )IR");
   }
@@ -488,7 +488,7 @@ define void @foo(i8 %v1) {
   auto It = BB->begin();
   auto *I0 = &*It++;
   auto *I1 = &*It++;
-  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+  auto *Ret = &*It++;
 
   // Check getPrevNode().
   EXPECT_EQ(Ret->getPrevNode(), I1);
@@ -508,7 +508,7 @@ define void @foo(i8 %v1) {
   // Check getOpcode().
   EXPECT_EQ(I0->getOpcode(), sandboxir::Instruction::Opcode::Opaque);
   EXPECT_EQ(I1->getOpcode(), sandboxir::Instruction::Opcode::Opaque);
-  EXPECT_EQ(Ret->getOpcode(), sandboxir::Instruction::Opcode::Ret);
+  EXPECT_EQ(Ret->getOpcode(), sandboxir::Instruction::Opcode::Opaque);
 
   // Check moveBefore(I).
   I1->moveBefore(I0);
@@ -561,74 +561,6 @@ define void @foo(i8 %v1) {
   EXPECT_EQ(I0->getNextNode(), Ret);
 }
 
-TEST_F(SandboxIRTest, SelectInst) {
-  parseIR(C, R"IR(
-define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) {
-  %sel = select i1 %c0, i8 %v0, i8 %v1
-  ret void
-}
-)IR");
-  llvm::Function *LLVMF = &*M->getFunction("foo");
-  sandboxir::Context Ctx(C);
-  sandboxir::Function *F = Ctx.createFunction(LLVMF);
-  auto *Cond0 = F->getArg(0);
-  auto *V0 = F->getArg(1);
-  auto *V1 = F->getArg(2);
-  auto *Cond1 = F->getArg(3);
-  auto *BB = &*F->begin();
-  auto It = BB->begin();
-  auto *Select = cast<sandboxir::SelectInst>(&*It++);
-  auto *Ret = &*It++;
-
-  // Check getCondition().
-  EXPECT_EQ(Select->getCondition(), Cond0);
-  // Check getTrueValue().
-  EXPECT_EQ(Select->getTrueValue(), V0);
-  // Check getFalseValue().
-  EXPECT_EQ(Select->getFalseValue(), V1);
-  // Check setCondition().
-  Select->setCondition(Cond1);
-  EXPECT_EQ(Select->getCondition(), Cond1);
-  // Check setTrueValue().
-  Select->setTrueValue(V1);
-  EXPECT_EQ(Select->getTrueValue(), V1);
-  // Check setFalseValue().
-  Select->setFalseValue(V0);
-  EXPECT_EQ(Select->getFalseValue(), V0);
-
-  {
-    // Check SelectInst::create() InsertBefore.
-    auto *NewSel = cast<sandboxir::SelectInst>(sandboxir::SelectInst::create(
-        Cond0, V0, V1, /*InsertBefore=*/Ret, Ctx));
-    EXPECT_EQ(NewSel->getCondition(), Cond0);
-    EXPECT_EQ(NewSel->getTrueValue(), V0);
-    EXPECT_EQ(NewSel->getFalseValue(), V1);
-    EXPECT_EQ(NewSel->getNextNode(), Ret);
-  }
-  {
-    // Check SelectInst::create() InsertAtEnd.
-    auto *NewSel = cast<sandboxir::SelectInst>(
-        sandboxir::SelectInst::create(Cond0, V0, V1, /*InsertAtEnd=*/BB, Ctx));
-    EXPECT_EQ(NewSel->getCondition(), Cond0);
-    EXPECT_EQ(NewSel->getTrueValue(), V0);
-    EXPECT_EQ(NewSel->getFalseValue(), V1);
-    EXPECT_EQ(NewSel->getPrevNode(), Ret);
-  }
-  {
-    // Check SelectInst::create() Folded.
-    auto *False =
-        sandboxir::Constant::createInt(llvm::Type::getInt1Ty(C), 0, Ctx,
-                                       /*IsSigned=*/false);
-    auto *FortyTwo =
-        sandboxir::Constant::createInt(llvm::Type::getInt1Ty(C), 42, Ctx,
-                                       /*IsSigned=*/false);
-    auto *NewSel =
-        sandboxir::SelectInst::create(False, FortyTwo, FortyTwo, Ret, Ctx);
-    EXPECT_TRUE(isa<sandboxir::Constant>(NewSel));
-    EXPECT_EQ(NewSel, FortyTwo);
-  }
-}
-
 TEST_F(SandboxIRTest, LoadInst) {
   parseIR(C, R"IR(
 define void @foo(ptr %arg0, ptr %arg1) {
@@ -644,7 +576,7 @@ define void @foo(ptr %arg0, ptr %arg1) {
   auto *BB = &*F->begin();
   auto It = BB->begin();
   auto *Ld = cast<sandboxir::LoadInst>(&*It++);
-  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+  auto *Ret = &*It++;
 
   // Check getPointerOperand()
   EXPECT_EQ(Ld->getPointerOperand(), Arg0);
@@ -675,7 +607,7 @@ define void @foo(i8 %val, ptr %ptr) {
   auto *BB = &*F->begin();
   auto It = BB->begin();
   auto *St = cast<sandboxir::StoreInst>(&*It++);
-  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+  auto *Ret = &*It++;
 
   // Check that the StoreInst has been created correctly.
   // Check getPointerOperand()
@@ -692,42 +624,3 @@ define void @foo(i8 %val, ptr %ptr) {
   EXPECT_EQ(NewSt->getPointerOperand(), Ptr);
   EXPECT_EQ(NewSt->getAlign(), 8);
 }
-
-TEST_F(SandboxIRTest, ReturnInst) {
-  parseIR(C, R"IR(
-define i8 @foo(i8 %val) {
-  %add = add i8 %val, 42
-  ret i8 %val
-}
-)IR");
-  llvm::Function *LLVMF = &*M->getFunction("foo");
-  sandboxir::Context Ctx(C);
-  sandboxir::Function *F = Ctx.createFunction(LLVMF);
-  auto *Val = F->getArg(0);
-  auto *BB = &*F->begin();
-  auto It = BB->begin();
-  It++;
-  auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
-
-  // Check that the ReturnInst has been created correctly.
-  // Check getReturnValue().
-  EXPECT_EQ(Ret->getReturnValue(), Val);
-
-  // Check create(InsertBefore) a void ReturnInst.
-  auto *NewRet1 = cast<sandboxir::ReturnInst>(
-      sandboxir::ReturnInst::create(nullptr, /*InsertBefore=*/Ret, Ctx));
-  EXPECT_EQ(NewRet1->getReturnValue(), nullptr);
-  // Check create(InsertBefore) a non-void ReturnInst.
-  auto *NewRet2 = cast<sandboxir::ReturnInst>(
-      sandboxir::ReturnInst::create(Val, /*InsertBefore=*/Ret, Ctx));
-  EXPECT_EQ(NewRet2->getReturnValue(), Val);
-
-  // Check create(InsertAtEnd) a void ReturnInst.
-  auto *NewRet3 = cast<sandboxir::ReturnInst>(
-      sandboxir::ReturnInst::create(nullptr, /*InsertAtEnd=*/BB, Ctx));
-  EXPECT_EQ(NewRet3->getReturnValue(), nullptr);
-  // Check create(InsertAtEnd) a non-void ReturnInst.
-  auto *NewRet4 = cast<sandboxir::ReturnInst>(
-      sandboxir::ReturnInst::create(Val, /*InsertAtEnd=*/BB, Ctx));
-  EXPECT_EQ(NewRet4->getReturnValue(), Val);
-}
diff --git a/llvm/unittests/Support/raw_socket_stream_test.cpp b/llvm/unittests/Support/raw_socket_stream_test.cpp
index 348fb4bb3e089..c4e8cfbbe7e6a 100644
--- a/llvm/unittests/Support/raw_socket_stream_test.cpp
+++ b/llvm/unittests/Support/raw_socket_stream_test.cpp
@@ -62,50 +62,17 @@ TEST(raw_socket_streamTest, CLIENT_TO_SERVER_AND_SERVER_TO_CLIENT) {
   ssize_t BytesRead = Server.read(Bytes, 8);
 
   std::string string(Bytes, 8);
-  ASSERT_EQ(Server.has_error(), false);
 
   ASSERT_EQ(8, BytesRead);
   ASSERT_EQ("01234567", string);
 }
 
-TEST(raw_socket_streamTest, READ_WITH_TIMEOUT) {
+TEST(raw_socket_streamTest, TIMEOUT_PROVIDED) {
   if (!hasUnixSocketSupport())
     GTEST_SKIP();
 
   SmallString<100> SocketPath;
-  llvm::sys::fs::createUniquePath("read_with_timeout.sock", SocketPath, true);
-
-  // Make sure socket file does not exist. May still be there from the last test
-  std::remove(SocketPath.c_str());
-
-  Expected<ListeningSocket> MaybeServerListener =
-      ListeningSocket::createUnix(SocketPath);
-  ASSERT_THAT_EXPECTED(MaybeServerListener, llvm::Succeeded());
-  ListeningSocket ServerListener = std::move(*MaybeServerListener);
-
-  Expected<std::unique_ptr<raw_socket_stream>> MaybeClient =
-      raw_socket_stream::createConnectedUnix(SocketPath);
-  ASSERT_THAT_EXPECTED(MaybeClient, llvm::Succeeded());
-
-  Expected<std::unique_ptr<raw_socket_stream>> MaybeServer =
-      ServerListener.accept();
-  ASSERT_THAT_EXPECTED(MaybeServer, llvm::Succeeded());
-  raw_socket_stream &Server = **MaybeServer;
-
-  char Bytes[8];
-  ssize_t BytesRead = Server.read(Bytes, 8, std::chrono::milliseconds(100));
-  ASSERT_EQ(BytesRead, -1);
-  ASSERT_EQ(Server.has_error(), true);
-  ASSERT_EQ(Server.error(), std::errc::timed_out);
-  Server.clear_error();
-}
-
-TEST(raw_socket_streamTest, ACCEPT_WITH_TIMEOUT) {
-  if (!hasUnixSocketSupport())
-    GTEST_SKIP();
-
-  SmallString<100> SocketPath;
-  llvm::sys::fs::createUniquePath("accept_with_timeout.sock", SocketPath, true);
+  llvm::sys::fs::createUniquePath("timout_provided.sock", SocketPath, true);
 
   // Make sure socket file does not exist. May still be there from the last test
   std::remove(SocketPath.c_str());
@@ -115,19 +82,19 @@ TEST(raw_socket_streamTest, ACCEPT_WITH_TIMEOUT) {
   ASSERT_THAT_EXPECTED(MaybeServerListener, llvm::Succeeded());
   ListeningSocket ServerListener = std::move(*MaybeServerListener);
 
+  std::chrono::milliseconds Timeout = std::chrono::milliseconds(100);
   Expected<std::unique_ptr<raw_socket_stream>> MaybeServer =
-      ServerListener.accept(std::chrono::milliseconds(100));
+      ServerListener.accept(Timeout);
   ASSERT_EQ(llvm::errorToErrorCode(MaybeServer.takeError()),
             std::errc::timed_out);
 }
 
-TEST(raw_socket_streamTest, ACCEPT_WITH_SHUTDOWN) {
+TEST(raw_socket_streamTest, FILE_DESCRIPTOR_CLOSED) {
   if (!hasUnixSocketSupport())
     GTEST_SKIP();
 
   SmallString<100> SocketPath;
-  llvm::sys::fs::createUniquePath("accept_with_shutdown.sock", SocketPath,
-                                  true);
+  llvm::sys::fs::createUniquePath("fd_closed.sock", SocketPath, true);
 
   // Make sure socket file does not exist. May still be there from the last test
   std::remove(SocketPath.c_str());
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index bede4e64696c5..e0a848351d06f 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -892,6 +892,7 @@ R"(All available -march extensions for RISC-V
     za64rs               1.0
     zaamo                1.0
     zabha                1.0
+    zacas                1.0
     zalrsc               1.0
     zama16b              1.0
     zawrs                1.0
@@ -1027,7 +1028,6 @@ R"(All available -march extensions for RISC-V
 Experimental extensions
     zicfilp              1.0       This is a long dummy description
     zicfiss              1.0
-    zacas                1.0
     zalasr               0.1
     smmpm                1.0
     smnpm                1.0
diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp
index 0aecfc64da208..f93dc3671197a 100644
--- a/llvm/unittests/TargetParser/TripleTest.cpp
+++ b/llvm/unittests/TargetParser/TripleTest.cpp
@@ -1169,12 +1169,6 @@ TEST(TripleTest, ParsedIDs) {
   EXPECT_EQ(Triple::Serenity, T.getOS());
   EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment());
 
-  T = Triple("aarch64-unknown-linux-pauthtest");
-  EXPECT_EQ(Triple::aarch64, T.getArch());
-  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
-  EXPECT_EQ(Triple::Linux, T.getOS());
-  EXPECT_EQ(Triple::PAuthTest, T.getEnvironment());
-
   T = Triple("huh");
   EXPECT_EQ(Triple::UnknownArch, T.getArch());
 }
diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
index 910488a14b985..04e9e0fa48db0 100644
--- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/RISCVISAUtils.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
@@ -210,46 +209,10 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
   OS << "\n#undef TUNE_PROC\n";
 }
 
-static void emitRISCVExtensionBitmask(RecordKeeper &RK, raw_ostream &OS) {
-
-  std::vector<Record *> Extensions =
-      RK.getAllDerivedDefinitionsIfDefined("RISCVExtensionBitmask");
-  llvm::sort(Extensions, [](const Record *Rec1, const Record *Rec2) {
-    return getExtensionName(Rec1) < getExtensionName(Rec2);
-  });
-
-#ifndef NDEBUG
-  llvm::DenseSet<std::pair<uint64_t, uint64_t>> Seen;
-#endif
-
-  OS << "#ifdef GET_RISCVExtensionBitmaskTable_IMPL\n";
-  OS << "static const RISCVExtensionBitmask ExtensionBitmask[]={\n";
-  for (const Record *Rec : Extensions) {
-    unsigned GroupIDVal = Rec->getValueAsInt("GroupID");
-    unsigned BitPosVal = Rec->getValueAsInt("BitPos");
-
-    StringRef ExtName = Rec->getValueAsString("Name");
-    ExtName.consume_front("experimental-");
-
-#ifndef NDEBUG
-    assert(Seen.insert(std::make_pair(GroupIDVal, BitPosVal)).second &&
-           "duplicated bitmask");
-#endif
-
-    OS << "    {"
-       << "\"" << ExtName << "\""
-       << ", " << GroupIDVal << ", " << BitPosVal << "ULL"
-       << "},\n";
-  }
-  OS << "};\n";
-  OS << "#endif\n";
-}
-
 static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) {
   emitRISCVExtensions(RK, OS);
   emitRISCVProfiles(RK, OS);
   emitRISCVProcs(RK, OS);
-  emitRISCVExtensionBitmask(RK, OS);
 }
 
 static TableGen::Emitter::Opt X("gen-riscv-target-def", EmitRISCVTargetDef,
diff --git a/llvm/utils/gn/secondary/clang/tools/clang-nvlink-wrapper/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-nvlink-wrapper/BUILD.gn
deleted file mode 100644
index 24a0e89b804bf..0000000000000
--- a/llvm/utils/gn/secondary/clang/tools/clang-nvlink-wrapper/BUILD.gn
+++ /dev/null
@@ -1,31 +0,0 @@
-import("//llvm/utils/TableGen/tablegen.gni")
-
-tablegen("NVLinkOpts") {
-  args = [ "-gen-opt-parser-defs" ]
-  td_file = "NVLinkOpts.td"
-}
-
-executable("clang-nvlink-wrapper") {
-  configs += [ "//llvm/utils/gn/build:clang_code" ]
-  deps = [
-    ":NVLinkOpts",
-    "//clang/lib/Basic",
-    "//llvm/lib/Analysis",
-    "//llvm/lib/BinaryFormat",
-    "//llvm/lib/Bitcode/Writer",
-    "//llvm/lib/CodeGen",
-    "//llvm/lib/IR",
-    "//llvm/lib/IRReader",
-    "//llvm/lib/LTO",
-    "//llvm/lib/MC",
-    "//llvm/lib/Object",
-    "//llvm/lib/Option",
-    "//llvm/lib/Passes",
-    "//llvm/lib/Support",
-    "//llvm/lib/Target",
-    "//llvm/lib/Target:TargetsToBuild",
-    "//llvm/lib/TargetParser",
-    "//llvm/lib/Transforms/Utils",
-  ]
-  sources = [ "ClangNVLinkWrapper.cpp" ]
-}
diff --git a/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn
index 6f00fcac4cd54..bc50bdcfecd5b 100644
--- a/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn
@@ -58,7 +58,6 @@ driver_executable("clang") {
     "//clang/lib/FrontendTool",
     "//clang/lib/Headers",
     "//clang/tools/clang-linker-wrapper",
-    "//clang/tools/clang-nvlink-wrapper",
     "//clang/tools/clang-offload-bundler",
     "//clang/tools/clang-offload-packager",
     "//llvm/include/llvm/Config:llvm-config",
diff --git a/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
index 8e1d09c9c2ee9..258fb61038df5 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
@@ -59,7 +59,6 @@ unittest("ToolingTests") {
     "RecursiveASTVisitorTests/Concept.cpp",
     "RecursiveASTVisitorTests/ConstructExpr.cpp",
     "RecursiveASTVisitorTests/DeclRefExpr.cpp",
-    "RecursiveASTVisitorTests/DeductionGuide.cpp",
     "RecursiveASTVisitorTests/ImplicitCtor.cpp",
     "RecursiveASTVisitorTests/ImplicitCtorInitializer.cpp",
     "RecursiveASTVisitorTests/InitListExprPostOrder.cpp",
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
index e1bb52cafa6ab..3005a2a8a1852 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
@@ -571,7 +571,6 @@ static_library("builtins") {
 
   if (current_cpu == "riscv" || current_cpu == "riscv64") {
     sources += [
-      "riscv/feature_bits.c",
       "riscv/fp_mode.c",
       "riscv/restore.S",
       "riscv/save.S",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index cc759d2337516..27f29c3344bfa 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -168,7 +168,6 @@ if (current_toolchain == default_toolchain) {
       "__algorithm/ranges_find_first_of.h",
       "__algorithm/ranges_find_if.h",
       "__algorithm/ranges_find_if_not.h",
-      "__algorithm/ranges_find_last.h",
       "__algorithm/ranges_for_each.h",
       "__algorithm/ranges_for_each_n.h",
       "__algorithm/ranges_generate.h",
@@ -580,7 +579,6 @@ if (current_toolchain == default_toolchain) {
       "__math/remainder.h",
       "__math/roots.h",
       "__math/rounding_functions.h",
-      "__math/special_functions.h",
       "__math/traits.h",
       "__math/trigonometric_functions.h",
       "__mbstate_t.h",
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGenData/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGenData/BUILD.gn
index d69579df7cc35..7fc69766368ec 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGenData/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGenData/BUILD.gn
@@ -5,9 +5,6 @@ static_library("CodeGenData") {
     "//llvm/lib/Support",
   ]
   sources = [
-    "CodeGenData.cpp",
-    "CodeGenDataReader.cpp",
-    "CodeGenDataWriter.cpp",
     "OutlinedHashTree.cpp",
     "OutlinedHashTreeRecord.cpp",
   ]
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index 31abb1285bc02..945fb0e922c9e 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -265,7 +265,6 @@ group("test") {
     "//llvm/tools/llvm-c-test",
     "//llvm/tools/llvm-cat",
     "//llvm/tools/llvm-cfi-verify",
-    "//llvm/tools/llvm-cgdata",
     "//llvm/tools/llvm-cov",
     "//llvm/tools/llvm-cvtres",
     "//llvm/tools/llvm-cxxdump",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn
deleted file mode 100644
index 8567c7d3de5da..0000000000000
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn
+++ /dev/null
@@ -1,12 +0,0 @@
-import("//llvm/utils/gn/build/driver_executable.gni")
-
-driver_executable("llvm-cgdata") {
-  deps = [
-    "//llvm/lib/CodeGen",
-    "//llvm/lib/CodeGenData",
-    "//llvm/lib/IR",
-    "//llvm/lib/Object",
-    "//llvm/lib/Support",
-  ]
-  sources = [ "llvm-cgdata.cpp" ]
-}
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
index 357e5d91ee19e..3e05b9b1f8c3f 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
@@ -26,7 +26,6 @@ unittest("OrcJITTests") {
     "LookupAndRecordAddrsTest.cpp",
     "MachOPlatformTest.cpp",
     "MapperJITLinkMemoryManagerTest.cpp",
-    "MemoryFlagsTest.cpp",
     "MemoryMapperTest.cpp",
     "ObjectFormatsTest.cpp",
     "ObjectLinkingLayerTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni
index e0107cdeae76d..7c02ed396db5f 100644
--- a/llvm/utils/gn/secondary/llvm/version.gni
+++ b/llvm/utils/gn/secondary/llvm/version.gni
@@ -1,4 +1,4 @@
-llvm_version_major = 20
+llvm_version_major = 19
 llvm_version_minor = 0
 llvm_version_patch = 0
 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch"
diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py
index ea56d192fb394..a5a1ff66bf417 100644
--- a/llvm/utils/lit/lit/__init__.py
+++ b/llvm/utils/lit/lit/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = "Daniel Dunbar"
 __email__ = "daniel at minormatter.com"
-__versioninfo__ = (20, 0, 0)
+__versioninfo__ = (19, 0, 0)
 __version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
 
 __all__ = []
diff --git a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt b/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt
index 26150c413dc03..e4997b1250ed3 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt
+++ b/llvm/utils/lit/tests/Inputs/shtest-env/env-calls-env.txt
@@ -1,10 +1,10 @@
 # Check that internal env can call internal env.
 
 # RUN: env env %{python} print_environment.py \
-# RUN: | FileCheck -check-prefix=CHECK-2-EMPTY-ARGS %s
+# RUN: | FileCheck -check-prefix=CHECK-2-EMPTY %s
 #
-# CHECK-2-EMPTY-ARGS: BAR = 2
-# CHECK-2-EMPTY-ARGS: FOO = 1
+# CHECK-2-EMPTY: BAR = 2
+# CHECK-2-EMPTY: FOO = 1
 
 # RUN: env FOO=2 env BAR=1 %{python} print_environment.py \
 # RUN: | FileCheck -check-prefix=CHECK-2-VAL %s
diff --git a/mlir/docs/Canonicalization.md b/mlir/docs/Canonicalization.md
index 03fd174229afe..d1cba572af212 100644
--- a/mlir/docs/Canonicalization.md
+++ b/mlir/docs/Canonicalization.md
@@ -33,10 +33,6 @@ together.
 
 Some important things to think about w.r.t. canonicalization patterns:
 
-*   The goal of canonicalization is to make subsequent analyses and
-    optimizations more effective. Therefore, performance improvements are not
-    necessary for canonicalization.
-
 *   Pass pipelines should not rely on the canonicalizer pass for correctness.
     They should work correctly with all instances of the canonicalization pass
     removed.
@@ -55,39 +51,6 @@ Some important things to think about w.r.t. canonicalization patterns:
 *   It is always good to eliminate operations entirely when possible, e.g. by
     folding known identities (like "x + 0 = x").
 
-*   Patterns with expensive running time (i.e. those with O(n) complexity) or
-    complicated cost models don't belong in canonicalization: since the
-    algorithm is executed iteratively until a fixed point is reached, we want
-    patterns that execute quickly (in particular their matching phase).
-
-*   Canonicalization shouldn't lose the semantics of the original operation:
-    the original information should always be recoverable from the transformed IR.
-
-For example, a pattern that transforms
-
-```
-  %transpose = linalg.transpose
-      ins(%input : tensor<1x2x3xf32>)
-      outs(%init1 : tensor<2x1x3xf32>)
-      permutation = [1, 0, 2]
-  %out = linalg.transpose
-      ins(%transpose : tensor<2x1x3xf32>)
-      outs(%init2 : tensor<3x1x2xf32>)
-      permutation = [2, 1, 0]
-```
-
-to
-
-```
-  %out = linalg.transpose
-      ins(%input : tensor<1x2x3xf32>)
-      outs(%init2 : tensor<3x1x2xf32>)
-      permutation = [2, 0, 1]
-```
-
-is a good canonicalization pattern because it removes a redundant operation,
-making other analyses and optimizations more efficient.
-
 ## Globally Applied Rules
 
 These transformations are applied to all levels of IR:
@@ -226,7 +189,7 @@ each of the operands, returning the corresponding constant attribute. These
 operands are those that implement the `ConstantLike` trait. If any of the
 operands are non-constant, a null `Attribute` value is provided instead. For
 example, if MyOp provides three operands [`a`, `b`, `c`], but only `b` is
-constant then `adaptor` will return Attribute() for `getA()` and `getC()`,
+constant then `adaptor` will return Attribute() for `getA()` and `getC()`, 
 and b-value for `getB()`.
 
 Also above, is the use of `OpFoldResult`. This class represents the possible
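
To make the adaptor behavior above concrete, here is a minimal hedged sketch of a fold hook, assuming the hypothetical `MyOp` from the paragraph (this is not code from the MLIR tree):

```cpp
// Hedged sketch: folding `MyOp` with the adaptor described above. Only the
// ConstantLike operand `b` yields a non-null attribute; `getA()`/`getC()`
// come back as null Attribute() values.
OpFoldResult MyOp::fold(FoldAdaptor adaptor) {
  auto b = llvm::dyn_cast_if_present<IntegerAttr>(adaptor.getB());
  if (!b)
    return {}; // `b` was not constant; nothing to fold.
  if (b.getValue().isZero())
    return getA(); // e.g. fold a hypothetical `a + 0` back to the SSA value `a`.
  return {};
}
```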
diff --git a/mlir/include/mlir-c/BuiltinTypes.h b/mlir/include/mlir-c/BuiltinTypes.h
index 2212087b9898f..99c5e3f46b04c 100644
--- a/mlir/include/mlir-c/BuiltinTypes.h
+++ b/mlir/include/mlir-c/BuiltinTypes.h
@@ -89,16 +89,6 @@ MLIR_CAPI_EXPORTED bool mlirTypeIsAFloat8E5M2(MlirType type);
 /// context.
 MLIR_CAPI_EXPORTED MlirType mlirFloat8E5M2TypeGet(MlirContext ctx);
 
-/// Returns the typeID of an Float8E4M3 type.
-MLIR_CAPI_EXPORTED MlirTypeID mlirFloat8E4M3TypeGetTypeID(void);
-
-/// Checks whether the given type is an f8E4M3 type.
-MLIR_CAPI_EXPORTED bool mlirTypeIsAFloat8E4M3(MlirType type);
-
-/// Creates an f8E4M3 type in the given context. The type is owned by the
-/// context.
-MLIR_CAPI_EXPORTED MlirType mlirFloat8E4M3TypeGet(MlirContext ctx);
-
 /// Returns the typeID of an Float8E4M3FN type.
 MLIR_CAPI_EXPORTED MlirTypeID mlirFloat8E4M3FNTypeGetTypeID(void);
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 6abcadf0a4b45..06656c791c594 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -560,14 +560,14 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [
                       DeclareOpInterfaceMethods<BranchWeightOpInterface>,
                       Terminator]> {
   let arguments = (ins
-                   OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$var_callee_type,
+                   OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$callee_type,
                    OptionalAttr<FlatSymbolRefAttr>:$callee,
                    Variadic<LLVM_Type>:$callee_operands,
                    Variadic<LLVM_Type>:$normalDestOperands,
                    Variadic<LLVM_Type>:$unwindDestOperands,
                    OptionalAttr<DenseI32ArrayAttr>:$branch_weights,
                    DefaultValuedAttr<CConv, "CConv::C">:$CConv);
-  let results = (outs Optional<LLVM_Type>:$result);
+  let results = (outs Variadic<LLVM_Type>);
   let successors = (successor AnySuccessor:$normalDest,
                               AnySuccessor:$unwindDest);
 
@@ -617,12 +617,11 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
     start with a function name (`@`-prefixed) and indirect calls start with an
     SSA value (`%`-prefixed). The direct callee, if present, is stored as a
     function attribute `callee`. For indirect calls, the callee is of `!llvm.ptr` type
-    and is stored as the first value in `callee_operands`. If and only if the
-    callee is a variadic function, the `var_callee_type` attribute must carry
-    the variadic LLVM function type. The trailing type list contains the
-    optional indirect callee type and the MLIR function type, which differs from
-    the LLVM function type that uses an explicit void type to model functions
-    that do not return a value.
+    and is stored as the first value in `callee_operands`. If the callee is a variadic
+    function, then the `callee_type` attribute must carry the function type. The
+    trailing type list contains the optional indirect callee type and the MLIR
+    function type, which differs from the LLVM function type that uses an explicit
+    void type to model functions that do not return a value.
 
     Examples:
 
@@ -645,19 +644,14 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
     ```
   }];
 
-  dag args = (ins OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$var_callee_type,
+  dag args = (ins OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$callee_type,
                   OptionalAttr<FlatSymbolRefAttr>:$callee,
                   Variadic<LLVM_Type>:$callee_operands,
                   DefaultValuedAttr<LLVM_FastmathFlagsAttr,
                                    "{}">:$fastmathFlags,
                   OptionalAttr<DenseI32ArrayAttr>:$branch_weights,
                   DefaultValuedAttr<CConv, "CConv::C">:$CConv,
-                  DefaultValuedAttr<TailCallKind, "TailCallKind::None">:$TailCallKind,
-                  OptionalAttr<LLVM_MemoryEffectsAttr>:$memory,
-                  OptionalAttr<UnitAttr>:$convergent,
-                  OptionalAttr<UnitAttr>:$no_unwind,
-                  OptionalAttr<UnitAttr>:$will_return
-                  );
+                  DefaultValuedAttr<TailCallKind, "TailCallKind::None">:$TailCallKind);
   // Append the aliasing related attributes defined in LLVM_MemAccessOpBase.
   let arguments = !con(args, aliasAttrs);
   let results = (outs Optional<LLVM_Type>:$result);
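
Given the description above, here is a hedged sketch of building a direct call to a variadic callee through the `LLVMFuncOp` builder overload visible later in this patch; `builder`, `loc`, `ctx`, `fmtPtr`, and `value` are assumed to exist, and the callee name is illustrative:

```cpp
// Hedged sketch, not code from the MLIR tree. The variadic LLVMFunctionType
// below is what the callee_type attribute must carry for variadic callees.
auto calleeType = LLVM::LLVMFunctionType::get(
    builder.getI32Type(), {LLVM::LLVMPointerType::get(ctx)},
    /*isVarArg=*/true);
auto printfFunc =
    builder.create<LLVM::LLVMFuncOp>(loc, "printf", calleeType);
// This builder overload derives the result type and the callee_type
// attribute from the callee's function type.
builder.create<LLVM::CallOp>(loc, printfFunc, ValueRange{fmtPtr, value});
```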
diff --git a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
index 7849782e5442b..b774359552aa5 100644
--- a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
@@ -243,14 +243,6 @@ SmallVector<int64_t>
 computePermutationVector(int64_t permSize, ArrayRef<int64_t> positions,
                          ArrayRef<int64_t> desiredPositions);
 
-/// Returns a permutation vector that drops the input dims in
-/// dropPositions from inputPerm.
-///
-/// For example, inputPerm = {2, 4, 0, 1, 3} and dropPositions = {1, 2} would
-/// result in a {2, 0, 1} permutation vector.
-SmallVector<int64_t> dropDims(ArrayRef<int64_t> inputPerm,
-                              ArrayRef<int64_t> dropPositions);
-
 /// Helper to return a subset of `arrayAttr` as a vector of int64_t.
 // TODO: Port everything relevant to DenseArrayAttr and drop this util.
 SmallVector<int64_t> getI64SubArray(ArrayAttr arrayAttr, unsigned dropFront = 0,
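
The removed doc comment's worked example can be reproduced with a short hedged sketch; `dropDimsSketch` is an illustrative name, not the deleted MLIR helper:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Hedged sketch of the documented behavior: drop the given input dims from a
// permutation and renumber the survivors.
llvm::SmallVector<int64_t> dropDimsSketch(llvm::ArrayRef<int64_t> inputPerm,
                                          llvm::ArrayRef<int64_t> dropPositions) {
  llvm::SmallVector<int64_t> result;
  for (int64_t dim : inputPerm) {
    if (llvm::is_contained(dropPositions, dim))
      continue; // This input dim is dropped outright.
    // Every dropped dim numerically below `dim` shifts it down by one.
    int64_t shift =
        llvm::count_if(dropPositions, [&](int64_t d) { return d < dim; });
    result.push_back(dim - shift);
  }
  return result;
}
// dropDimsSketch({2, 4, 0, 1, 3}, {1, 2}) yields {2, 0, 1}, matching the
// example in the removed comment.
```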
diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index 1c4d329fbf0d8..0d5fa719d0dee 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -61,7 +61,6 @@ class Builder {
 
   // Types.
   FloatType getFloat8E5M2Type();
-  FloatType getFloat8E4M3Type();
   FloatType getFloat8E4M3FNType();
   FloatType getFloat8E5M2FNUZType();
   FloatType getFloat8E4M3FNUZType();
diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h
index 4250be90ba7fb..5579b138668d2 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.h
+++ b/mlir/include/mlir/IR/BuiltinTypes.h
@@ -39,11 +39,6 @@ struct IntegerTypeStorage;
 struct TupleTypeStorage;
 } // namespace detail
 
-/// Type trait indicating that the type has value semantics.
-template <typename ConcreteType>
-class ValueSemantics
-    : public TypeTrait::TraitBase<ConcreteType, ValueSemantics> {};
-
 //===----------------------------------------------------------------------===//
 // FloatType
 //===----------------------------------------------------------------------===//
@@ -61,7 +56,6 @@ class FloatType : public Type {
   static FloatType getF80(MLIRContext *ctx);
   static FloatType getF128(MLIRContext *ctx);
   static FloatType getFloat8E5M2(MLIRContext *ctx);
-  static FloatType getFloat8E4M3(MLIRContext *ctx);
   static FloatType getFloat8E4M3FN(MLIRContext *ctx);
   static FloatType getFloat8E5M2FNUZ(MLIRContext *ctx);
   static FloatType getFloat8E4M3FNUZ(MLIRContext *ctx);
@@ -411,20 +405,16 @@ inline bool BaseMemRefType::isValidElementType(Type type) {
 }
 
 inline bool FloatType::classof(Type type) {
-  return llvm::isa<
-      Float8E5M2Type, Float8E4M3Type, Float8E4M3FNType, Float8E5M2FNUZType,
-      Float8E4M3FNUZType, Float8E4M3B11FNUZType, BFloat16Type, Float16Type,
-      FloatTF32Type, Float32Type, Float64Type, Float80Type, Float128Type>(type);
+  return llvm::isa<Float8E5M2Type, Float8E4M3FNType, Float8E5M2FNUZType,
+                   Float8E4M3FNUZType, Float8E4M3B11FNUZType, BFloat16Type,
+                   Float16Type, FloatTF32Type, Float32Type, Float64Type,
+                   Float80Type, Float128Type>(type);
 }
 
 inline FloatType FloatType::getFloat8E5M2(MLIRContext *ctx) {
   return Float8E5M2Type::get(ctx);
 }
 
-inline FloatType FloatType::getFloat8E4M3(MLIRContext *ctx) {
-  return Float8E4M3Type::get(ctx);
-}
-
 inline FloatType FloatType::getFloat8E4M3FN(MLIRContext *ctx) {
   return Float8E4M3FNType::get(ctx);
 }
diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index 0b3532dcc7d4f..4cade83dd3c32 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -30,15 +30,6 @@ class Builtin_Type<string name, string typeMnemonic, list<Trait> traits = [],
   let typeName = "builtin." # typeMnemonic;
 }
 
-//===----------------------------------------------------------------------===//
-// Traits
-//===----------------------------------------------------------------------===//
-
-/// Type trait indicating that the type has value semantics.
-def ValueSemantics : NativeTypeTrait<"ValueSemantics"> {
-  let cppNamespace = "::mlir";
-}
-
 //===----------------------------------------------------------------------===//
 // ComplexType
 //===----------------------------------------------------------------------===//
@@ -106,25 +97,6 @@ def Builtin_Float8E5M2 : Builtin_FloatType<"Float8E5M2", "f8E5M2"> {
   }];
 }
 
-//===----------------------------------------------------------------------===//
-// Float8E4M3Type
-
-def Builtin_Float8E4M3 : Builtin_FloatType<"Float8E4M3", "f8E4M3"> {
-  let summary = "8-bit floating point with 3 bit mantissa";
-  let description = [{
-    An 8-bit floating point type with 1 sign bit, 4 bits exponent and 3 bits
-    mantissa. This is not a standard type as defined by IEEE-754, but it
-    follows similar conventions with the following characteristics:
-
-      * bit encoding: S1E4M3
-      * exponent bias: 7
-      * infinities: supported with exponent set to all 1s and mantissa 0s
-      * NaNs: supported with exponent bits set to all 1s and mantissa of
-        (001, 010, 011, 100, 101, 110, 111)
-      * denormals when exponent is 0
-  }];
-}
-
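
For reference, the encoding the removed description documents can be decoded by hand. A hedged sketch for normal values only (zero, denormals, NaN, and infinity deliberately omitted); `decodeF8E4M3` is an illustrative name:

```cpp
#include <cmath>
#include <cstdint>

// Hedged sketch: decode a *normal* f8E4M3 value per the removed description
// (S1E4M3 layout, exponent bias 7).
float decodeF8E4M3(uint8_t bits) {
  int sign = bits >> 7;
  int exponent = (bits >> 3) & 0xF; // 4 exponent bits
  int mantissa = bits & 0x7;        // 3 mantissa bits
  float value = std::ldexp(1.0f + mantissa / 8.0f, exponent - 7);
  return sign ? -value : value;
}
// Example: 0x48 (sign 0, exponent 0b1001 = 9, mantissa 0) decodes to
// 2^(9 - 7) * 1.0 = 4.0f.
```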
 //===----------------------------------------------------------------------===//
 // Float8E4M3FNType
 
@@ -773,7 +745,7 @@ def Builtin_Opaque : Builtin_Type<"Opaque", "opaque"> {
 //===----------------------------------------------------------------------===//
 
 def Builtin_RankedTensor : Builtin_Type<"RankedTensor", "tensor", [
-    ShapedTypeInterface, ValueSemantics
+    ShapedTypeInterface
   ], "TensorType"> {
   let summary = "Multi-dimensional array with a fixed number of dimensions";
   let description = [{
@@ -1029,7 +1001,7 @@ def Builtin_UnrankedMemRef : Builtin_Type<"UnrankedMemRef", "unranked_memref", [
 //===----------------------------------------------------------------------===//
 
 def Builtin_UnrankedTensor : Builtin_Type<"UnrankedTensor", "unranked_tensor", [
-    ShapedTypeInterface, ValueSemantics
+    ShapedTypeInterface
   ], "TensorType"> {
   let summary = "Multi-dimensional array with unknown dimensions";
   let description = [{
@@ -1077,8 +1049,7 @@ def Builtin_UnrankedTensor : Builtin_Type<"UnrankedTensor", "unranked_tensor", [
 // VectorType
 //===----------------------------------------------------------------------===//
 
-def Builtin_Vector : Builtin_Type<"Vector", "vector",
-    [ShapedTypeInterface, ValueSemantics], "Type"> {
+def Builtin_Vector : Builtin_Type<"Vector", "vector", [ShapedTypeInterface], "Type"> {
   let summary = "Multi-dimensional SIMD vector type";
   let description = [{
     Syntax:
diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td
index c23d2d87e080f..af4f13dc09360 100644
--- a/mlir/include/mlir/IR/CommonTypeConstraints.td
+++ b/mlir/include/mlir/IR/CommonTypeConstraints.td
@@ -89,9 +89,6 @@ def HasStaticShapePred :
 // Whether a type is a TupleType.
 def IsTupleTypePred : CPred<"::llvm::isa<::mlir::TupleType>($_self)">;
 
-// Whether a type has a ValueSemantics trait.
-def HasValueSemanticsPred : CPred<"$_self.hasTrait<::mlir::ValueSemantics>()">;
-
 //===----------------------------------------------------------------------===//
 // Type definitions
 //===----------------------------------------------------------------------===//
@@ -334,8 +331,6 @@ def F8E4M3FN : Type<CPred<"$_self.isFloat8E4M3FN()">, "f8E4M3FN type">,
                BuildableType<"$_builder.getFloat8E4M3FNType()">;
 def F8E5M2 : Type<CPred<"$_self.isFloat8E5M2()">, "f8E5M2 type">,
              BuildableType<"$_builder.getFloat8E5M2Type()">;
-def F8E4M3 : Type<CPred<"$_self.isFloat8E4M3()">, "f8E4M3 type">,
-             BuildableType<"$_builder.getFloat8E4M3Type()">;
 def F8E4M3FNUZ : Type<CPred<"$_self.isFloat8E4M3FNUZ()">, "f8E4M3FNUZ type">,
                  BuildableType<"$_builder.getFloat8E4M3FNUZType()">;
 def F8E4M3B11FNUZ : Type<CPred<"$_self.isFloat8E4M3B11FNUZ()">, "f8E4M3B11FNUZ type">,
@@ -408,11 +403,6 @@ class HasRankGreaterOrEqualPred<int rank> : And<[
     CPred<[{::llvm::cast<::mlir::ShapedType>($_self).getRank() >= }] # rank>
 ]>;
 
-// Container with value semantics.
-class ValueSemanticsContainerOf<list<Type> allowedTypes> :
-  ShapedContainerType<allowedTypes, HasValueSemanticsPred,
-  "container with value semantics">;
-
 // Vector types.
 
 class VectorOf<list<Type> allowedTypes> :
@@ -852,18 +842,10 @@ class NestedTupleOf<list<Type> allowedTypes> :
 // Common type constraints
 //===----------------------------------------------------------------------===//
 // Type constraint for types that are "like" some type or set of types T, that is
-// they're either a T, a vector of Ts, or a tensor of Ts.
+// they're either a T, a vector of Ts, or a tensor of Ts
 class TypeOrContainer<Type allowedType, string name> : TypeConstraint<Or<[
-  allowedType.predicate,
-  ValueSemanticsContainerOf<[allowedType]>.predicate]>,
-  name>;
-
-// Type constraint for types that are "like" some type or set of types T, that is
-// they're either a T or a mapable container of Ts.
-class TypeOrValueSemanticsContainer<Type allowedType, string name>
-    : TypeConstraint<Or<[
-  allowedType.predicate,
-  ValueSemanticsContainerOf<[allowedType]>.predicate]>,
+  allowedType.predicate, VectorOf<[allowedType]>.predicate,
+  TensorOf<[allowedType]>.predicate]>,
   name>;
 
 // Temporary constraint to allow gradual transition to supporting 0-D vectors.
@@ -882,8 +864,8 @@ def BoolLikeOfAnyRank : TypeOrContainerOfAnyRank<I1, "bool-like">;
 
 // Type constraint for signless-integer-like types: signless integers, indices,
 // vectors of signless integers or indices, tensors of signless integers.
-def SignlessIntegerLike : TypeOrValueSemanticsContainer<
-    AnySignlessIntegerOrIndex, "signless-integer-like">;
+def SignlessIntegerLike : TypeOrContainer<AnySignlessIntegerOrIndex,
+    "signless-integer-like">;
 
 def SignlessIntegerLikeOfAnyRank : TypeOrContainerOfAnyRank<
     AnySignlessIntegerOrIndex,
diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h
index a32de33114e40..65824531fdc90 100644
--- a/mlir/include/mlir/IR/Types.h
+++ b/mlir/include/mlir/IR/Types.h
@@ -126,7 +126,6 @@ class Type {
   // derived types should use isa/dyn_cast.
   bool isIndex() const;
   bool isFloat8E5M2() const;
-  bool isFloat8E4M3() const;
   bool isFloat8E4M3FN() const;
   bool isFloat8E5M2FNUZ() const;
   bool isFloat8E4M3FNUZ() const;
diff --git a/mlir/lib/AsmParser/TokenKinds.def b/mlir/lib/AsmParser/TokenKinds.def
index eb3154c6da42e..297e074594530 100644
--- a/mlir/lib/AsmParser/TokenKinds.def
+++ b/mlir/lib/AsmParser/TokenKinds.def
@@ -95,7 +95,6 @@ TOK_KEYWORD(f32)
 TOK_KEYWORD(f64)
 TOK_KEYWORD(f80)
 TOK_KEYWORD(f8E5M2)
-TOK_KEYWORD(f8E4M3)
 TOK_KEYWORD(f8E4M3FN)
 TOK_KEYWORD(f8E5M2FNUZ)
 TOK_KEYWORD(f8E4M3FNUZ)
diff --git a/mlir/lib/AsmParser/TypeParser.cpp b/mlir/lib/AsmParser/TypeParser.cpp
index 467b5f0844ab3..0b46c96bbc04d 100644
--- a/mlir/lib/AsmParser/TypeParser.cpp
+++ b/mlir/lib/AsmParser/TypeParser.cpp
@@ -40,7 +40,6 @@ OptionalParseResult Parser::parseOptionalType(Type &type) {
   case Token::kw_vector:
   case Token::inttype:
   case Token::kw_f8E5M2:
-  case Token::kw_f8E4M3:
   case Token::kw_f8E4M3FN:
   case Token::kw_f8E5M2FNUZ:
   case Token::kw_f8E4M3FNUZ:
@@ -305,9 +304,6 @@ Type Parser::parseNonFunctionType() {
   case Token::kw_f8E5M2:
     consumeToken(Token::kw_f8E5M2);
     return builder.getFloat8E5M2Type();
-  case Token::kw_f8E4M3:
-    consumeToken(Token::kw_f8E4M3);
-    return builder.getFloat8E4M3Type();
   case Token::kw_f8E4M3FN:
     consumeToken(Token::kw_f8E4M3FN);
     return builder.getFloat8E4M3FNType();
diff --git a/mlir/lib/Bindings/Python/IRTypes.cpp b/mlir/lib/Bindings/Python/IRTypes.cpp
index 5e0aebc03e2c1..e1e4eb999b3aa 100644
--- a/mlir/lib/Bindings/Python/IRTypes.cpp
+++ b/mlir/lib/Bindings/Python/IRTypes.cpp
@@ -143,7 +143,7 @@ class PyFloat8E4M3FNType
   }
 };
 
-/// Floating Point Type subclass - Float8E5M2Type.
+/// Floating Point Type subclass - Float8E5M2Type.
 class PyFloat8E5M2Type : public PyConcreteType<PyFloat8E5M2Type, PyFloatType> {
 public:
   static constexpr IsAFunctionTy isaFunction = mlirTypeIsAFloat8E5M2;
@@ -163,26 +163,6 @@ class PyFloat8E5M2Type : public PyConcreteType<PyFloat8E5M2Type, PyFloatType> {
   }
 };
 
-/// Floating Point Type subclass - Float8E4M3Type.
-class PyFloat8E4M3Type : public PyConcreteType<PyFloat8E4M3Type, PyFloatType> {
-public:
-  static constexpr IsAFunctionTy isaFunction = mlirTypeIsAFloat8E4M3;
-  static constexpr GetTypeIDFunctionTy getTypeIdFunction =
-      mlirFloat8E4M3TypeGetTypeID;
-  static constexpr const char *pyClassName = "Float8E4M3Type";
-  using PyConcreteType::PyConcreteType;
-
-  static void bindDerived(ClassTy &c) {
-    c.def_static(
-        "get",
-        [](DefaultingPyMlirContext context) {
-          MlirType t = mlirFloat8E4M3TypeGet(context->get());
-          return PyFloat8E4M3Type(context->getRef(), t);
-        },
-        py::arg("context") = py::none(), "Create a float8_e4m3 type.");
-  }
-};
-
 /// Floating Point Type subclass - Float8E4M3FNUZ.
 class PyFloat8E4M3FNUZType
     : public PyConcreteType<PyFloat8E4M3FNUZType, PyFloatType> {
@@ -860,7 +840,6 @@ void mlir::python::populateIRTypes(py::module &m) {
   PyIndexType::bind(m);
   PyFloat8E4M3FNType::bind(m);
   PyFloat8E5M2Type::bind(m);
-  PyFloat8E4M3Type::bind(m);
   PyFloat8E4M3FNUZType::bind(m);
   PyFloat8E4M3B11FNUZType::bind(m);
   PyFloat8E5M2FNUZType::bind(m);
diff --git a/mlir/lib/CAPI/IR/BuiltinTypes.cpp b/mlir/lib/CAPI/IR/BuiltinTypes.cpp
index d507027357c26..01bb71d9228b4 100644
--- a/mlir/lib/CAPI/IR/BuiltinTypes.cpp
+++ b/mlir/lib/CAPI/IR/BuiltinTypes.cpp
@@ -97,18 +97,6 @@ MlirType mlirFloat8E5M2TypeGet(MlirContext ctx) {
   return wrap(FloatType::getFloat8E5M2(unwrap(ctx)));
 }
 
-MlirTypeID mlirFloat8E4M3TypeGetTypeID() {
-  return wrap(Float8E4M3Type::getTypeID());
-}
-
-bool mlirTypeIsAFloat8E4M3(MlirType type) {
-  return unwrap(type).isFloat8E4M3();
-}
-
-MlirType mlirFloat8E4M3TypeGet(MlirContext ctx) {
-  return wrap(FloatType::getFloat8E4M3(unwrap(ctx)));
-}
-
 MlirTypeID mlirFloat8E4M3FNTypeGetTypeID() {
   return wrap(Float8E4M3FNType::getTypeID());
 }
diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index d5df960928afb..32d02d5e438bd 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -247,9 +247,8 @@ Type LLVMTypeConverter::convertIntegerType(IntegerType type) const {
 }
 
 Type LLVMTypeConverter::convertFloatType(FloatType type) const {
-  if (type.isFloat8E5M2() || type.isFloat8E4M3() || type.isFloat8E4M3FN() ||
-      type.isFloat8E5M2FNUZ() || type.isFloat8E4M3FNUZ() ||
-      type.isFloat8E4M3B11FNUZ())
+  if (type.isFloat8E5M2() || type.isFloat8E4M3FN() || type.isFloat8E5M2FNUZ() ||
+      type.isFloat8E4M3FNUZ() || type.isFloat8E4M3B11FNUZ())
     return IntegerType::get(&getContext(), type.getWidth());
   return type;
 }
diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
index 527fbe5cf628a..a4390447532a5 100644
--- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
+++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
@@ -144,8 +144,8 @@ struct VectorBroadcastConvert final
 
     SmallVector<Value, 4> source(castOp.getResultVectorType().getNumElements(),
                                  adaptor.getSource());
-    rewriter.replaceOpWithNewOp<spirv::CompositeConstructOp>(castOp, resultType,
-                                                             source);
+    rewriter.replaceOpWithNewOp<spirv::CompositeConstructOp>(
+        castOp, castOp.getResultVectorType(), source);
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
index a362c8500aa5b..8e1cb474feee7 100644
--- a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
@@ -56,7 +56,6 @@ static std::optional<FloatType> parseFloatType(MLIRContext *ctx,
   Builder b(ctx);
   return llvm::StringSwitch<std::optional<FloatType>>(name)
       .Case("f8E5M2", b.getFloat8E5M2Type())
-      .Case("f8E4M3", b.getFloat8E4M3Type())
       .Case("f8E4M3FN", b.getFloat8E4M3FNType())
       .Case("f8E5M2FNUZ", b.getFloat8E5M2FNUZType())
       .Case("f8E4M3FNUZ", b.getFloat8E4M3FNUZType())
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index f0ac45be4b9e8..9372caf6e32a7 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -948,11 +948,6 @@ static SmallVector<Type, 1> getCallOpResultTypes(LLVMFunctionType calleeType) {
   return results;
 }
 
-/// Gets the variadic callee type for a LLVMFunctionType.
-static TypeAttr getCallOpVarCalleeType(LLVMFunctionType calleeType) {
-  return calleeType.isVarArg() ? TypeAttr::get(calleeType) : nullptr;
-}
-
 /// Constructs a LLVMFunctionType from MLIR `results` and `args`.
 static LLVMFunctionType getLLVMFuncType(MLIRContext *context, TypeRange results,
                                         ValueRange args) {
@@ -979,11 +974,9 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
                    FlatSymbolRefAttr callee, ValueRange args) {
   assert(callee && "expected non-null callee in direct call builder");
   build(builder, state, results,
-        /*var_callee_type=*/nullptr, callee, args, /*fastmathFlags=*/nullptr,
-        /*branch_weights=*/nullptr,
+        TypeAttr::get(getLLVMFuncType(builder.getContext(), results, args)),
+        callee, args, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
-        /*memory=*/nullptr,
-        /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1004,23 +997,18 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
                    LLVMFunctionType calleeType, FlatSymbolRefAttr callee,
                    ValueRange args) {
   build(builder, state, getCallOpResultTypes(calleeType),
-        getCallOpVarCalleeType(calleeType), callee, args,
-        /*fastmathFlags=*/nullptr,
+        TypeAttr::get(calleeType), callee, args, /*fastmathFlags=*/nullptr,
         /*branch_weights=*/nullptr, /*CConv=*/nullptr,
-        /*TailCallKind=*/nullptr, /*memory=*/nullptr, /*convergent=*/nullptr,
-        /*no_unwind=*/nullptr, /*will_return=*/nullptr,
-        /*access_groups=*/nullptr,
+        /*TailCallKind=*/nullptr, /*access_groups=*/nullptr,
         /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
 
 void CallOp::build(OpBuilder &builder, OperationState &state,
                    LLVMFunctionType calleeType, ValueRange args) {
   build(builder, state, getCallOpResultTypes(calleeType),
-        getCallOpVarCalleeType(calleeType),
-        /*callee=*/nullptr, args,
+        TypeAttr::get(calleeType), /*callee=*/nullptr, args,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
-        /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory=*/nullptr,
-        /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
+        /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1029,10 +1017,9 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
                    ValueRange args) {
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
-        getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), args,
+        TypeAttr::get(calleeType), SymbolRefAttr::get(func), args,
         /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
-        /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory=*/nullptr,
-        /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
+        /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
         /*access_groups=*/nullptr, /*alias_scopes=*/nullptr,
         /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr);
 }
@@ -1089,49 +1076,9 @@ static LogicalResult verifyCallOpDebugInfo(CallOp callOp, LLVMFuncOp callee) {
   return success();
 }
 
-/// Verify that the parameter and return types of the variadic callee type match
-/// the `callOp` argument and result types.
-template <typename OpTy>
-LogicalResult verifyCallOpVarCalleeType(OpTy callOp) {
-  std::optional<LLVMFunctionType> varCalleeType = callOp.getVarCalleeType();
-  if (!varCalleeType)
-    return success();
-
-  // Verify the variadic callee type is a variadic function type.
-  if (!varCalleeType->isVarArg())
-    return callOp.emitOpError(
-        "expected var_callee_type to be a variadic function type");
-
-  // Verify the variadic callee type has at most as many parameters as the call
-  // has argument operands.
-  if (varCalleeType->getNumParams() > callOp.getArgOperands().size())
-    return callOp.emitOpError("expected var_callee_type to have at most ")
-           << callOp.getArgOperands().size() << " parameters";
-
-  // Verify the variadic callee type matches the call argument types.
-  for (auto [paramType, operand] :
-       llvm::zip(varCalleeType->getParams(), callOp.getArgOperands()))
-    if (paramType != operand.getType())
-      return callOp.emitOpError()
-             << "var_callee_type parameter type mismatch: " << paramType
-             << " != " << operand.getType();
-
-  // Verify the variadic callee type matches the call result type.
-  if (!callOp.getNumResults()) {
-    if (!isa<LLVMVoidType>(varCalleeType->getReturnType()))
-      return callOp.emitOpError("expected var_callee_type to return void");
-  } else {
-    if (callOp.getResult().getType() != varCalleeType->getReturnType())
-      return callOp.emitOpError("var_callee_type return type mismatch: ")
-             << varCalleeType->getReturnType()
-             << " != " << callOp.getResult().getType();
-  }
-  return success();
-}
-
 LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
-  if (failed(verifyCallOpVarCalleeType(*this)))
-    return failure();
+  if (getNumResults() > 1)
+    return emitOpError("must have 0 or 1 result");
 
   // Type for the callee, we'll get it differently depending if it is a direct
   // or indirect call.
@@ -1173,8 +1120,8 @@ LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
   if (!funcType)
     return emitOpError("callee does not have a functional type: ") << fnType;
 
-  if (funcType.isVarArg() && !getVarCalleeType())
-    return emitOpError() << "missing var_callee_type attribute for vararg call";
+  if (funcType.isVarArg() && !getCalleeType())
+    return emitOpError() << "missing callee type attribute for vararg call";
 
   // Verify that the operand and result types match the callee.
 
@@ -1221,13 +1168,21 @@ void CallOp::print(OpAsmPrinter &p) {
   auto callee = getCallee();
   bool isDirect = callee.has_value();
 
+  LLVMFunctionType calleeType;
+  bool isVarArg = false;
+
+  if (std::optional<LLVMFunctionType> optionalCalleeType = getCalleeType()) {
+    calleeType = *optionalCalleeType;
+    isVarArg = calleeType.isVarArg();
+  }
+
   p << ' ';
 
   // Print calling convention.
   if (getCConv() != LLVM::CConv::C)
     p << stringifyCConv(getCConv()) << ' ';
 
-  if (getTailCallKind() != LLVM::TailCallKind::None)
+  if (getTailCallKind() != LLVM::TailCallKind::None)
     p << tailcallkind::stringifyTailCallKind(getTailCallKind()) << ' ';
 
   // Print the direct callee if present as a function attribute, or an indirect
@@ -1240,13 +1195,12 @@ void CallOp::print(OpAsmPrinter &p) {
   auto args = getOperands().drop_front(isDirect ? 0 : 1);
   p << '(' << args << ')';
 
-  // Print the variadic callee type if the call is variadic.
-  if (std::optional<LLVMFunctionType> varCalleeType = getVarCalleeType())
-    p << " vararg(" << *varCalleeType << ")";
+  if (isVarArg)
+    p << " vararg(" << calleeType << ")";
 
   p.printOptionalAttrDict(processFMFAttr((*this)->getAttrs()),
-                          {getCalleeAttrName(), getTailCallKindAttrName(),
-                           getVarCalleeTypeAttrName(), getCConvAttrName()});
+                          {getCConvAttrName(), "callee", "callee_type",
+                           getTailCallKindAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1316,11 +1270,11 @@ static ParseResult parseOptionalCallFuncPtr(
 
 // <operation> ::= `llvm.call` (cconv)? (tailcallkind)? (function-id | ssa-use)
 //                             `(` ssa-use-list `)`
-//                             ( `vararg(` var-callee-type `)` )?
+//                             ( `vararg(` var-arg-func-type `)` )?
 //                             attribute-dict? `:` (type `,`)? function-type
 ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
   SymbolRefAttr funcAttr;
-  TypeAttr varCalleeType;
+  TypeAttr calleeType;
   SmallVector<OpAsmParser::UnresolvedOperand> operands;
 
   // Default to C Calling Convention if no keyword is provided.
@@ -1351,12 +1305,8 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
 
   bool isVarArg = parser.parseOptionalKeyword("vararg").succeeded();
   if (isVarArg) {
-    StringAttr varCalleeTypeAttrName =
-        CallOp::getVarCalleeTypeAttrName(result.name);
     if (parser.parseLParen().failed() ||
-        parser
-            .parseAttribute(varCalleeType, varCalleeTypeAttrName,
-                            result.attributes)
+        parser.parseAttribute(calleeType, "callee_type", result.attributes)
             .failed() ||
         parser.parseRParen().failed())
       return failure();
@@ -1370,8 +1320,8 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) {
 }
 
 LLVMFunctionType CallOp::getCalleeFunctionType() {
-  if (std::optional<LLVMFunctionType> varCalleeType = getVarCalleeType())
-    return *varCalleeType;
+  if (getCalleeType())
+    return *getCalleeType();
   return getLLVMFuncType(getContext(), getResultTypes(), getArgOperands());
 }
 
@@ -1384,8 +1334,8 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
                      Block *unwind, ValueRange unwindOps) {
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
-        getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops,
-        normalOps, unwindOps, nullptr, nullptr, normal, unwind);
+        TypeAttr::get(calleeType), SymbolRefAttr::get(func), ops, normalOps,
+        unwindOps, nullptr, nullptr, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
@@ -1393,8 +1343,8 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys,
                      ValueRange normalOps, Block *unwind,
                      ValueRange unwindOps) {
   build(builder, state, tys,
-        /*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr,
-        nullptr, normal, unwind);
+        TypeAttr::get(getLLVMFuncType(builder.getContext(), tys, ops)), callee,
+        ops, normalOps, unwindOps, nullptr, nullptr, normal, unwind);
 }
 
 void InvokeOp::build(OpBuilder &builder, OperationState &state,
@@ -1402,8 +1352,8 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state,
                      ValueRange ops, Block *normal, ValueRange normalOps,
                      Block *unwind, ValueRange unwindOps) {
   build(builder, state, getCallOpResultTypes(calleeType),
-        getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps,
-        nullptr, nullptr, normal, unwind);
+        TypeAttr::get(calleeType), callee, ops, normalOps, unwindOps, nullptr,
+        nullptr, normal, unwind);
 }
 
 SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) {
@@ -1440,8 +1390,8 @@ MutableOperandRange InvokeOp::getArgOperandsMutable() {
 }
 
 LogicalResult InvokeOp::verify() {
-  if (failed(verifyCallOpVarCalleeType(*this)))
-    return failure();
+  if (getNumResults() > 1)
+    return emitOpError("must have 0 or 1 result");
 
   Block *unwindDest = getUnwindDest();
   if (unwindDest->empty())
@@ -1459,6 +1409,14 @@ void InvokeOp::print(OpAsmPrinter &p) {
   auto callee = getCallee();
   bool isDirect = callee.has_value();
 
+  LLVMFunctionType calleeType;
+  bool isVarArg = false;
+
+  if (std::optional<LLVMFunctionType> optionalCalleeType = getCalleeType()) {
+    calleeType = *optionalCalleeType;
+    isVarArg = calleeType.isVarArg();
+  }
+
   p << ' ';
 
   // Print calling convention.
@@ -1477,13 +1435,12 @@ void InvokeOp::print(OpAsmPrinter &p) {
   p << " unwind ";
   p.printSuccessorAndUseList(getUnwindDest(), getUnwindDestOperands());
 
-  // Print the variadic callee type if the invoke is variadic.
-  if (std::optional<LLVMFunctionType> varCalleeType = getVarCalleeType())
-    p << " vararg(" << *varCalleeType << ")";
+  if (isVarArg)
+    p << " vararg(" << calleeType << ")";
 
   p.printOptionalAttrDict((*this)->getAttrs(),
-                          {getCalleeAttrName(), getOperandSegmentSizeAttr(),
-                           getCConvAttrName(), getVarCalleeTypeAttrName()});
+                          {InvokeOp::getOperandSegmentSizeAttr(), "callee",
+                           "callee_type", InvokeOp::getCConvAttrName()});
 
   p << " : ";
   if (!isDirect)
@@ -1496,12 +1453,12 @@ void InvokeOp::print(OpAsmPrinter &p) {
 //                  `(` ssa-use-list `)`
 //                  `to` bb-id (`[` ssa-use-and-type-list `]`)?
 //                  `unwind` bb-id (`[` ssa-use-and-type-list `]`)?
-//                  ( `vararg(` var-callee-type `)` )?
+//                  ( `vararg(` var-arg-func-type `)` )?
 //                  attribute-dict? `:` (type `,`)? function-type
 ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
   SmallVector<OpAsmParser::UnresolvedOperand, 8> operands;
   SymbolRefAttr funcAttr;
-  TypeAttr varCalleeType;
+  TypeAttr calleeType;
   Block *normalDest, *unwindDest;
   SmallVector<Value, 4> normalOperands, unwindOperands;
   Builder &builder = parser.getBuilder();
@@ -1531,12 +1488,8 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
 
   bool isVarArg = parser.parseOptionalKeyword("vararg").succeeded();
   if (isVarArg) {
-    StringAttr varCalleeTypeAttrName =
-        InvokeOp::getVarCalleeTypeAttrName(result.name);
     if (parser.parseLParen().failed() ||
-        parser
-            .parseAttribute(varCalleeType, varCalleeTypeAttrName,
-                            result.attributes)
+        parser.parseAttribute(calleeType, "callee_type", result.attributes)
             .failed() ||
         parser.parseRParen().failed())
       return failure();
@@ -1562,8 +1515,8 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) {
 }
 
 LLVMFunctionType InvokeOp::getCalleeFunctionType() {
-  if (std::optional<LLVMFunctionType> varCalleeType = getVarCalleeType())
-    return *varCalleeType;
+  if (getCalleeType())
+    return *getCalleeType();
   return getLLVMFuncType(getContext(), getResultTypes(), getArgOperands());
 }
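
Taken together, these hunks retire the dedicated var_callee_type accessor: the builders now always record the full callee signature as a plain callee_type attribute, and the printer and parser round-trip it through the vararg(...) clause. A minimal sketch of the textual form this supports, modeled on the surviving roundtrip tests (function names are illustrative):

  llvm.func @variadic(%arg0: i32, ...)

  llvm.func @caller(%arg0: i32) {
    // The variadic signature travels in vararg(...) and is stored under
    // the plain "callee_type" attribute name.
    llvm.call @variadic(%arg0, %arg0) vararg(!llvm.func<void (i32, ...)>) : (i32, i32) -> ()
    llvm.return
  }
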
 
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index d1db90bbe2d20..cefaad9b22653 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -1895,68 +1895,9 @@ struct FoldTransposeWithTranspose : OpRewritePattern<linalg::TransposeOp> {
   }
 };
 
-/// This pattern canonicalize transpose by swapping the order of
-/// broadcast and transpose:
-///   transpose(broadcast(input)) -> broadcast(transpose(input))
-struct SwapTransposeWithBroadcast : OpRewritePattern<linalg::TransposeOp> {
-  using OpRewritePattern<linalg::TransposeOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(linalg::TransposeOp transposeOp,
-                                PatternRewriter &rewriter) const override {
-    Value input = transposeOp.getInput();
-    BroadcastOp broadcastOp = input.getDefiningOp<BroadcastOp>();
-    if (!input.hasOneUse() || !broadcastOp)
-      return failure();
-
-    ArrayRef<int64_t> dimensions = broadcastOp.getDimensions();
-    ArrayRef<int64_t> perms = transposeOp.getPermutation();
-
-    // Get new perms and new dimensions.
-    SmallVector<int64_t> resultPerms = dropDims(perms, dimensions);
-    SmallVector<int64_t> invertPerm = invertPermutationVector(perms);
-    SmallVector<int64_t> resultDimensions;
-    unsigned dimensionSize = dimensions.size();
-    for (unsigned i = 0; i < dimensionSize; ++i)
-      resultDimensions.push_back(invertPerm[dimensions[i]]);
-
-    // Create transpose result.
-    Value broadcastInput = broadcastOp.getInput();
-    Location loc = transposeOp.getLoc();
-    MLIRContext *ctx = transposeOp.getContext();
-    SmallVector<OpFoldResult> dims;
-    auto broadcastInputTy =
-        mlir::cast<RankedTensorType>(broadcastInput.getType());
-    unsigned inputRank = broadcastInputTy.getRank();
-    for (unsigned i = 0; i < inputRank; ++i) {
-      if (broadcastInputTy.isDynamicDim(i)) {
-        dims.push_back(rewriter.create<tensor::DimOp>(loc, broadcastInput, i)
-                           ->getResult(0));
-      } else {
-        dims.push_back(IntegerAttr::get(IndexType::get(ctx),
-                                        broadcastInputTy.getDimSize(i)));
-      }
-    }
-    SmallVector<OpFoldResult> transposeResultShapes =
-        applyPermutation(dims, resultPerms);
-    Value transposeInit = rewriter.create<tensor::EmptyOp>(
-        transposeOp.getLoc(), transposeResultShapes,
-        broadcastInputTy.getElementType());
-
-    // Create broadcast(transpose(input)).
-    Value transposeResult =
-        rewriter
-            .create<TransposeOp>(loc, broadcastOp.getInput(), transposeInit,
-                                 resultPerms)
-            ->getResult(0);
-    rewriter.replaceOpWithNewOp<BroadcastOp>(
-        transposeOp, transposeResult, transposeOp.getInit(), resultDimensions);
-    return success();
-  }
-};
-
 void TransposeOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                               MLIRContext *context) {
-  results.add<FoldTransposeWithTranspose, SwapTransposeWithBroadcast>(context);
+  results.add<FoldTransposeWithTranspose>(context);
 }
 
 //===----------------------------------------------------------------------===//
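
For reference, the deleted pattern rewrote a transpose whose producer is a broadcast into a broadcast of a transposed input. After this change, IR like the following sketch is left as written (shapes are illustrative, not taken from the patch):

  %b = linalg.broadcast
      ins(%input : tensor<2x4x5xf32>)
      outs(%init1 : tensor<2x3x4x5xf32>)
      dimensions = [1]
  // Previously canonicalized to broadcast(transpose(%input)).
  %t = linalg.transpose
      ins(%b : tensor<2x3x4x5xf32>)
      outs(%init2 : tensor<5x4x3x2xf32>)
      permutation = [3, 2, 1, 0]
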
diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
index 108839a4d90e9..aba225be720c3 100644
--- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp
+++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
@@ -252,32 +252,6 @@ mlir::computePermutationVector(int64_t permSize, ArrayRef<int64_t> positions,
   return res;
 }
 
-SmallVector<int64_t> mlir::dropDims(ArrayRef<int64_t> inputPerm,
-                                    ArrayRef<int64_t> dropPositions) {
-  assert(inputPerm.size() >= dropPositions.size() &&
-         "expect inputPerm size large than position to drop");
-  SmallVector<int64_t> res;
-  unsigned permSize = inputPerm.size();
-  for (unsigned inputIndex = 0; inputIndex < permSize; ++inputIndex) {
-    int64_t targetIndex = inputPerm[inputIndex];
-    bool shouldDrop = false;
-    unsigned dropSize = dropPositions.size();
-    for (unsigned dropIndex = 0; dropIndex < dropSize; dropIndex++) {
-      if (dropPositions[dropIndex] == inputPerm[inputIndex]) {
-        shouldDrop = true;
-        break;
-      }
-      if (dropPositions[dropIndex] < inputPerm[inputIndex]) {
-        targetIndex--;
-      }
-    }
-    if (!shouldDrop) {
-      res.push_back(targetIndex);
-    }
-  }
-  return res;
-}
-
 SmallVector<int64_t> mlir::getI64SubArray(ArrayAttr arrayAttr,
                                           unsigned dropFront,
                                           unsigned dropBack) {
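
A worked example of the helper removed above, for reviewers checking the revert (values are illustrative):

  inputPerm = [2, 0, 3, 1], dropPositions = [0]  ->  result = [1, 2, 0]

Each entry equal to a drop position vanishes, and every larger entry is shifted down by the number of dropped values below it.
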
diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp
index 7978b35e7147d..75cc01ee9a146 100644
--- a/mlir/lib/IR/AffineExpr.cpp
+++ b/mlir/lib/IR/AffineExpr.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cmath>
 #include <cstdint>
 #include <limits>
 #include <utility>
@@ -258,7 +257,7 @@ int64_t AffineExpr::getLargestKnownDivisor() const {
     if (rhs && rhs.getValue() != 0) {
       int64_t lhsDiv = binExpr.getLHS().getLargestKnownDivisor();
       if (lhsDiv % rhs.getValue() == 0)
-        return std::abs(lhsDiv / rhs.getValue());
+        return lhsDiv / rhs.getValue();
     }
     return 1;
   }
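
To illustrate the floordiv case above (the constants are illustrative): for the map below, the largest known divisor of the left-hand side is 4 and the constant right-hand side is 2, so 4 % 2 == 0 and the helper returns 4 / 2 = 2; without the std::abs guard the result keeps the sign of the constant when it is negative.

  #map = affine_map<(d0) -> ((d0 * 4) floordiv 2)>
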
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index e5b1291afce2b..13eb18036eeec 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -2576,7 +2576,6 @@ void AsmPrinter::Impl::printTypeImpl(Type type) {
       })
       .Case<IndexType>([&](Type) { os << "index"; })
       .Case<Float8E5M2Type>([&](Type) { os << "f8E5M2"; })
-      .Case<Float8E4M3Type>([&](Type) { os << "f8E4M3"; })
       .Case<Float8E4M3FNType>([&](Type) { os << "f8E4M3FN"; })
       .Case<Float8E5M2FNUZType>([&](Type) { os << "f8E5M2FNUZ"; })
       .Case<Float8E4M3FNUZType>([&](Type) { os << "f8E4M3FNUZ"; })
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index d0eb2d8fbae9d..d49f69a7b7ae6 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -38,10 +38,6 @@ FloatType Builder::getFloat8E5M2Type() {
   return FloatType::getFloat8E5M2(context);
 }
 
-FloatType Builder::getFloat8E4M3Type() {
-  return FloatType::getFloat8E4M3(context);
-}
-
 FloatType Builder::getFloat8E4M3FNType() {
   return FloatType::getFloat8E4M3FN(context);
 }
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index faa944937e007..179797cb943a1 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -87,9 +87,8 @@ IntegerType IntegerType::scaleElementBitwidth(unsigned scale) {
 //===----------------------------------------------------------------------===//
 
 unsigned FloatType::getWidth() {
-  if (llvm::isa<Float8E5M2Type, Float8E4M3Type, Float8E4M3FNType,
-                Float8E5M2FNUZType, Float8E4M3FNUZType, Float8E4M3B11FNUZType>(
-          *this))
+  if (llvm::isa<Float8E5M2Type, Float8E4M3FNType, Float8E5M2FNUZType,
+          Float8E4M3FNUZType, Float8E4M3B11FNUZType>(*this))
     return 8;
   if (llvm::isa<Float16Type, BFloat16Type>(*this))
     return 16;
@@ -108,8 +107,6 @@ unsigned FloatType::getWidth() {
 const llvm::fltSemantics &FloatType::getFloatSemantics() {
   if (llvm::isa<Float8E5M2Type>(*this))
     return APFloat::Float8E5M2();
-  if (llvm::isa<Float8E4M3Type>(*this))
-    return APFloat::Float8E4M3();
   if (llvm::isa<Float8E4M3FNType>(*this))
     return APFloat::Float8E4M3FN();
   if (llvm::isa<Float8E5M2FNUZType>(*this))
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index 12336701c9ca0..214b354c5347e 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -222,7 +222,6 @@ class MLIRContextImpl {
 
   /// Cached Type Instances.
   Float8E5M2Type f8E5M2Ty;
-  Float8E4M3Type f8E4M3Ty;
   Float8E4M3FNType f8E4M3FNTy;
   Float8E5M2FNUZType f8E5M2FNUZTy;
   Float8E4M3FNUZType f8E4M3FNUZTy;
@@ -313,7 +312,6 @@ MLIRContext::MLIRContext(const DialectRegistry &registry, Threading setting)
   //// Types.
   /// Floating-point Types.
   impl->f8E5M2Ty = TypeUniquer::get<Float8E5M2Type>(this);
-  impl->f8E4M3Ty = TypeUniquer::get<Float8E4M3Type>(this);
   impl->f8E4M3FNTy = TypeUniquer::get<Float8E4M3FNType>(this);
   impl->f8E5M2FNUZTy = TypeUniquer::get<Float8E5M2FNUZType>(this);
   impl->f8E4M3FNUZTy = TypeUniquer::get<Float8E4M3FNUZType>(this);
@@ -1014,9 +1012,6 @@ StorageUniquer &MLIRContext::getTypeUniquer() { return getImpl().typeUniquer; }
 Float8E5M2Type Float8E5M2Type::get(MLIRContext *context) {
   return context->getImpl().f8E5M2Ty;
 }
-Float8E4M3Type Float8E4M3Type::get(MLIRContext *context) {
-  return context->getImpl().f8E4M3Ty;
-}
 Float8E4M3FNType Float8E4M3FNType::get(MLIRContext *context) {
   return context->getImpl().f8E4M3FNTy;
 }
diff --git a/mlir/lib/IR/Types.cpp b/mlir/lib/IR/Types.cpp
index e8cd28bf9e85d..1d1ba6df4db2f 100644
--- a/mlir/lib/IR/Types.cpp
+++ b/mlir/lib/IR/Types.cpp
@@ -35,7 +35,6 @@ Type AbstractType::replaceImmediateSubElements(Type type,
 MLIRContext *Type::getContext() const { return getDialect().getContext(); }
 
 bool Type::isFloat8E5M2() const { return llvm::isa<Float8E5M2Type>(*this); }
-bool Type::isFloat8E4M3() const { return llvm::isa<Float8E4M3Type>(*this); }
 bool Type::isFloat8E4M3FN() const { return llvm::isa<Float8E4M3FNType>(*this); }
 bool Type::isFloat8E5M2FNUZ() const {
   return llvm::isa<Float8E5M2FNUZType>(*this);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
index 8837c19b48588..3d6dd1247b413 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
@@ -219,25 +219,6 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder,
     }
     call->setCallingConv(convertCConvToLLVM(callOp.getCConv()));
     call->setTailCallKind(convertTailCallKindToLLVM(callOp.getTailCallKind()));
-    if (callOp.getConvergentAttr())
-      call->addFnAttr(llvm::Attribute::Convergent);
-    if (callOp.getNoUnwindAttr())
-      call->addFnAttr(llvm::Attribute::NoUnwind);
-    if (callOp.getWillReturnAttr())
-      call->addFnAttr(llvm::Attribute::WillReturn);
-
-    if (MemoryEffectsAttr memAttr = callOp.getMemoryAttr()) {
-      llvm::MemoryEffects memEffects =
-          llvm::MemoryEffects(llvm::MemoryEffects::Location::ArgMem,
-                              convertModRefInfoToLLVM(memAttr.getArgMem())) |
-          llvm::MemoryEffects(
-              llvm::MemoryEffects::Location::InaccessibleMem,
-              convertModRefInfoToLLVM(memAttr.getInaccessibleMem())) |
-          llvm::MemoryEffects(llvm::MemoryEffects::Location::Other,
-                              convertModRefInfoToLLVM(memAttr.getOther()));
-      call->setMemoryEffects(memEffects);
-    }
-
     moduleTranslation.setAccessGroupsMetadata(callOp, call);
     moduleTranslation.setAliasScopeMetadata(callOp, call);
     moduleTranslation.setTBAAMetadata(callOp, call);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8b031deca8931..f7cb52236e1a9 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1009,32 +1009,6 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
   return bodyGenStatus;
 }
 
-static void
-buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
-                LLVM::ModuleTranslation &moduleTranslation,
-                SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
-  if (dependVars.empty())
-    return;
-  for (auto dep : llvm::zip(dependVars, depends->getValue())) {
-    llvm::omp::RTLDependenceKindTy type;
-    switch (
-        cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
-    case mlir::omp::ClauseTaskDepend::taskdependin:
-      type = llvm::omp::RTLDependenceKindTy::DepIn;
-      break;
-    // The OpenMP runtime requires that the codegen for 'depend' clause for
-    // 'out' dependency kind must be the same as codegen for 'depend' clause
-    // with 'inout' dependency.
-    case mlir::omp::ClauseTaskDepend::taskdependout:
-    case mlir::omp::ClauseTaskDepend::taskdependinout:
-      type = llvm::omp::RTLDependenceKindTy::DepInOut;
-      break;
-    };
-    llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
-    llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
-    dds.emplace_back(dd);
-  }
-}
 /// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
@@ -1058,8 +1032,28 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
   };
 
   SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
-  buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
-                  moduleTranslation, dds);
+  if (!taskOp.getDependVars().empty() && taskOp.getDepends()) {
+    for (auto dep :
+         llvm::zip(taskOp.getDependVars(), taskOp.getDepends()->getValue())) {
+      llvm::omp::RTLDependenceKindTy type;
+      switch (
+          cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
+      case mlir::omp::ClauseTaskDepend::taskdependin:
+        type = llvm::omp::RTLDependenceKindTy::DepIn;
+        break;
+      // The OpenMP runtime requires that the codegen for 'depend' clause for
+      // 'out' dependency kind must be the same as codegen for 'depend' clause
+      // with 'inout' dependency.
+      case mlir::omp::ClauseTaskDepend::taskdependout:
+      case mlir::omp::ClauseTaskDepend::taskdependinout:
+        type = llvm::omp::RTLDependenceKindTy::DepInOut;
+        break;
+      };
+      llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
+      llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
+      dds.emplace_back(dd);
+    }
+  }
 
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
@@ -3298,14 +3292,10 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
     if (!mapData.IsDeclareTarget[i] && !mapData.IsAMember[i])
       kernelInput.push_back(mapData.OriginalValue[i]);
   }
-  SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
-  buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
-                  moduleTranslation, dds);
 
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
-      defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB,
-      dds));
+      defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB));
 
   // Remap access operations to declare target reference pointers for the
   // device, essentially generating extra loadop's as necessary
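
The inlined loop above (together with the dds argument dropped from createTarget) lowers task dependences. A minimal sketch of the construct it handles, with syntax modeled on the OpenMP dialect tests (the variable name is assumed):

  // 'out' dependences are deliberately lowered exactly like 'inout'.
  omp.task depend(taskdependin -> %x : !llvm.ptr) {
    omp.terminator
  }
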
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index 42e9b66a6a2ac..16007592175f7 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -1471,28 +1471,6 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) {
     callOp.setTailCallKind(
         convertTailCallKindFromLLVM(callInst->getTailCallKind()));
     setFastmathFlagsAttr(inst, callOp);
-
-    // Handle function attributes.
-    if (callInst->hasFnAttr(llvm::Attribute::Convergent))
-      callOp.setConvergent(true);
-    if (callInst->hasFnAttr(llvm::Attribute::NoUnwind))
-      callOp.setNoUnwind(true);
-    if (callInst->hasFnAttr(llvm::Attribute::WillReturn))
-      callOp.setWillReturn(true);
-
-    llvm::MemoryEffects memEffects = callInst->getMemoryEffects();
-    ModRefInfo othermem = convertModRefInfoFromLLVM(
-        memEffects.getModRef(llvm::MemoryEffects::Location::Other));
-    ModRefInfo argMem = convertModRefInfoFromLLVM(
-        memEffects.getModRef(llvm::MemoryEffects::Location::ArgMem));
-    ModRefInfo inaccessibleMem = convertModRefInfoFromLLVM(
-        memEffects.getModRef(llvm::MemoryEffects::Location::InaccessibleMem));
-    auto memAttr = MemoryEffectsAttr::get(callOp.getContext(), othermem, argMem,
-                                          inaccessibleMem);
-    // Only set the attribute when it does not match the default value.
-    if (!memAttr.isReadWrite())
-      callOp.setMemoryAttr(memAttr);
-
     if (!callInst->getType()->isVoidTy())
       mapValue(inst, callOp.getResult());
     else
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index a045868b66031..0b552a7e1ca3b 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -293,9 +293,10 @@ class CreateBlockRewrite : public BlockRewrite {
 /// original location.
 class EraseBlockRewrite : public BlockRewrite {
 public:
-  EraseBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block)
-      : BlockRewrite(Kind::EraseBlock, rewriterImpl, block),
-        region(block->getParent()), insertBeforeBlock(block->getNextNode()) {}
+  EraseBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block,
+                    Region *region, Block *insertBeforeBlock)
+      : BlockRewrite(Kind::EraseBlock, rewriterImpl, block), region(region),
+        insertBeforeBlock(insertBeforeBlock) {}
 
   static bool classof(const IRRewrite *rewrite) {
     return rewrite->getKind() == Kind::EraseBlock;
@@ -1439,7 +1440,9 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op,
 }
 
 void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) {
-  appendRewrite<EraseBlockRewrite>(block);
+  Region *region = block->getParent();
+  Block *origNextBlock = block->getNextNode();
+  appendRewrite<EraseBlockRewrite>(block, region, origNextBlock);
 }
 
 void ConversionPatternRewriterImpl::notifyBlockInserted(
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
index 224e77a3f46be..317e688076304 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
@@ -123,7 +123,6 @@ __all__ = [
     "Float8E4M3B11FNUZType",
     "Float8E4M3FNType",
     "Float8E4M3FNUZType",
-    "Float8E4M3Type",
     "Float8E5M2FNUZType",
     "Float8E5M2Type",
     "FloatAttr",
@@ -1576,19 +1575,6 @@ class Float8E4M3FNUZType(FloatType):
     @property
     def typeid(self) -> TypeID: ...
 
-class Float8E4M3Type(FloatType):
-    static_typeid: ClassVar[TypeID]
-    @staticmethod
-    def get(context: Optional[Context] = None) -> Float8E4M3Type:
-        """
-        Create a float8_e4m3 type.
-        """
-    @staticmethod
-    def isinstance(other: Type) -> bool: ...
-    def __init__(self, cast_from_type: Type) -> None: ...
-    @property
-    def typeid(self) -> TypeID: ...
-
 class Float8E5M2FNUZType(FloatType):
     static_typeid: ClassVar[TypeID]
     @staticmethod
diff --git a/mlir/python/mlir/extras/types.py b/mlir/python/mlir/extras/types.py
index fde9909a8f9d6..b93c46b172a9a 100644
--- a/mlir/python/mlir/extras/types.py
+++ b/mlir/python/mlir/extras/types.py
@@ -14,7 +14,6 @@
     F64Type,
     Float8E4M3B11FNUZType,
     Float8E4M3FNType,
-    Float8E4M3Type,
     Float8E5M2Type,
     FunctionType,
     IndexType,
@@ -69,7 +68,6 @@ def ui(width):
 bf16 = lambda: BF16Type.get()
 
 f8E5M2 = lambda: Float8E5M2Type.get()
-f8E4M3 = lambda: Float8E4M3Type.get()
 f8E4M3FN = lambda: Float8E4M3FNType.get()
 f8E4M3B11FNUZ = lambda: Float8E4M3B11FNUZType.get()
 
diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
index edad208749930..667aad7645c51 100644
--- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
+++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
@@ -150,19 +150,6 @@ func.func @broadcast(%arg0 : f32) -> (vector<4xf32>, vector<2xf32>) {
 
 // -----
 
-// CHECK-LABEL: @broadcast_index
-//  CHECK-SAME: %[[ARG0:.*]]: index
-//       CHECK:   %[[CAST0:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : index to i32
-//       CHECK:   %[[CONSTRUCT:.*]] = spirv.CompositeConstruct %[[CAST0]], %[[CAST0]], %[[CAST0]], %[[CAST0]] : (i32, i32, i32, i32) -> vector<4xi32>
-//       CHECK:   %[[CAST1:.*]] = builtin.unrealized_conversion_cast %[[CONSTRUCT]] : vector<4xi32> to vector<4xindex>
-//       CHECK:   return %[[CAST1]] : vector<4xindex>
-func.func @broadcast_index(%a: index) -> vector<4xindex> {
-  %0 = vector.broadcast %a : index to vector<4xindex>
-  return %0 : vector<4xindex>
-}
-
-// -----
-
 // CHECK-LABEL: @extract
 //  CHECK-SAME: %[[ARG:.+]]: vector<2xf32>
 //       CHECK:   spirv.CompositeExtract %[[ARG]][0 : i32] : vector<2xf32>
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index fe288dab973f5..39f8e70b9fb7b 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -1415,163 +1415,10 @@ func.func @invalid_zext_target_type_two(%arg: vector<1xi32>)  {
 
 // -----
 
-llvm.func @non_variadic(%arg: i32)
-
-llvm.func @invalid_var_callee_type(%arg: i32)  {
-  // expected-error@below {{expected var_callee_type to be a variadic function type}}
-  llvm.call @non_variadic(%arg) vararg(!llvm.func<void (i32)>) : (i32) -> ()
-  llvm.return
-}
-
-// -----
-
-llvm.func @variadic(%arg: i32, ...)
-
-llvm.func @invalid_var_callee_type_num_parameters(%arg: i32)  {
-  // expected-error@below {{expected var_callee_type to have at most 1 parameters}}
-  llvm.call @variadic(%arg) vararg(!llvm.func<void (i32, i64, ...)>) : (i32) -> ()
-  llvm.return
-}
-
-// -----
-
-llvm.func @invalid_var_callee_type_num_parameters_indirect(%callee : !llvm.ptr, %arg: i32)  {
-  // expected-error@below {{expected var_callee_type to have at most 1 parameters}}
-  llvm.call %callee(%arg) vararg(!llvm.func<void (i32, i64, ...)>) : !llvm.ptr, (i32) -> ()
-  llvm.return
-}
-
-// -----
-
-llvm.func @variadic(%arg: i32, ...)
-
-llvm.func @invalid_var_callee_type_parameter_type_mismatch(%arg: i32)  {
-  // expected-error@below {{var_callee_type parameter type mismatch: 'i64' != 'i32'}}
-  llvm.call @variadic(%arg) vararg(!llvm.func<void (i64, ...)>) : (i32) -> ()
-  llvm.return
-}
-
-// -----
-
-llvm.func @invalid_var_callee_type_parameter_type_mismatch_indirect(%callee : !llvm.ptr, %arg: i32)  {
-  // expected-error@below {{var_callee_type parameter type mismatch: 'i64' != 'i32'}}
-  llvm.call %callee(%arg) vararg(!llvm.func<void (i64, ...)>) : !llvm.ptr, (i32) -> ()
-  llvm.return
-}
-
-// -----
-
-llvm.func @variadic(%arg: i32, ...)
-
-llvm.func @invalid_var_callee_type_non_void(%arg: i32)  {
-  // expected-error@below {{expected var_callee_type to return void}}
-  llvm.call @variadic(%arg) vararg(!llvm.func<i8 (i32, ...)>) : (i32) -> ()
-  llvm.return
-}
-
-// -----
-
-llvm.func @variadic(%arg: i32, ...) -> i32
-
-llvm.func @invalid_var_callee_type_return_type_mismatch(%arg: i32)  {
-  // expected-error@below {{var_callee_type return type mismatch: 'i8' != 'i32'}}
-  %0 = llvm.call @variadic(%arg) vararg(!llvm.func<i8 (i32, ...)>) : (i32) -> (i32)
-  llvm.return
-}
-
-// -----
-
-llvm.func @non_variadic(%arg: i32)
-
-llvm.func @invalid_var_callee_type(%arg: i32)  {
-  // expected-error@below {{expected var_callee_type to be a variadic function type}}
-  llvm.invoke @non_variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i32)>) : (i32) -> ()
-^bb1:
-  llvm.return
-^bb2:
-  llvm.return
-}
-
-// -----
-
-llvm.func @variadic(%arg: i32, ...)
-
-llvm.func @invalid_var_callee_type_num_parameters(%arg: i32)  {
-  // expected-error@below {{expected var_callee_type to have at most 1 parameters}}
-  llvm.invoke @variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i32, i64, ...)>) : (i32) -> ()
-^bb1:
-  llvm.return
-^bb2:
-  llvm.return
-}
-
-// -----
-
-llvm.func @invalid_var_callee_type_num_parameters_indirect(%callee : !llvm.ptr, %arg: i32)  {
-  // expected-error@below {{expected var_callee_type to have at most 1 parameters}}
-  llvm.invoke %callee(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i32, i64, ...)>) : !llvm.ptr, (i32) -> ()
-^bb1:
-  llvm.return
-^bb2:
-  llvm.return
-}
-
-// -----
-
-llvm.func @variadic(%arg: i32, ...)
-
-llvm.func @invalid_var_callee_type_parameter_type_mismatch(%arg: i32)  {
-  // expected-error@below {{var_callee_type parameter type mismatch: 'i64' != 'i32'}}
-  llvm.invoke @variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i64, ...)>) : (i32) -> ()
-^bb1:
-  llvm.return
-^bb2:
-  llvm.return
-}
-
-// -----
-
-llvm.func @invalid_var_callee_type_parameter_type_mismatch_indirect(%callee : !llvm.ptr, %arg: i32)  {
-  // expected-error@below {{var_callee_type parameter type mismatch: 'i64' != 'i32'}}
-  llvm.invoke %callee(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (i64, ...)>) : !llvm.ptr, (i32) -> ()
-^bb1:
-  llvm.return
-^bb2:
-  llvm.return
-}
-
-// -----
-
-llvm.func @variadic(%arg: i32, ...)
-
-llvm.func @invalid_var_callee_type_non_void(%arg: i32)  {
-  // expected-error@below {{expected var_callee_type to return void}}
-  llvm.invoke @variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<i8 (i32, ...)>) : (i32) -> ()
-^bb1:
-  llvm.return
-^bb2:
-  llvm.return
-}
-
-// -----
-
-llvm.func @variadic(%arg: i32, ...) -> i32
-
-llvm.func @invalid_var_callee_type_return_type_mismatch(%arg: i32)  {
-  // expected-error@below {{var_callee_type return type mismatch: 'i8' != 'i32'}}
-  %0 = llvm.invoke @variadic(%arg) to ^bb2 unwind ^bb1 vararg(!llvm.func<i8 (i32, ...)>) : (i32) -> (i32)
-^bb1:
-  llvm.return
-^bb2:
-  llvm.return
-}
-
-// -----
-
 llvm.func @variadic(...)
 
 llvm.func @invalid_variadic_call(%arg: i32)  {
-  // expected-error@+1 {{missing var_callee_type attribute for vararg call}}
+  // expected-error@+1 {{missing callee type attribute for vararg call}}
   "llvm.call"(%arg) <{callee = @variadic}> : (i32) -> ()
   llvm.return
 }
@@ -1581,7 +1428,7 @@ llvm.func @invalid_variadic_call(%arg: i32)  {
 llvm.func @variadic(...)
 
 llvm.func @invalid_variadic_call(%arg: i32)  {
-  // expected-error@+1 {{missing var_callee_type attribute for vararg call}}
+  // expected-error@+1 {{missing callee type attribute for vararg call}}
   "llvm.call"(%arg) <{callee = @variadic}> : (i32) -> ()
   llvm.return
 }
@@ -1598,14 +1445,14 @@ llvm.func @foo(%arg: !llvm.ptr) {
 
 func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
   // expected-error@+1 {{to use im2col mode, the tensor has to be at least 3-dimensional}}
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
   return
 }
 // -----
 
 func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
   // expected-error@+1 {{im2col offsets must be 2 less than number of coordinates}}
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
   return
 }
 
@@ -1613,7 +1460,7 @@ func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !
 
 func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
   // expected-error@+1 {{expects coordinates between 1 to 5 dimension}}
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[]: !llvm.ptr<3>, !llvm.ptr
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[]: !llvm.ptr<3>, !llvm.ptr
   return
 }
 
@@ -1622,7 +1469,7 @@ func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !
 
 func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
   // expected-error at +1 {{expects coordinates between 1 to 5 dimension}}
-  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd0,%crd1,%crd2,%crd3]: !llvm.ptr<3>, !llvm.ptr
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd0,%crd1,%crd2,%crd3]: !llvm.ptr<3>, !llvm.ptr
   return
 }
 
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index ff16bb0f857dd..ca9748a2b8b7b 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -1,10 +1,5 @@
 // RUN: mlir-opt %s | mlir-opt | FileCheck %s
 
-
-// CHECK-LABEL: func @baz
-// something to call
-llvm.func @baz()
-
 // CHECK-LABEL: func @ops
 // CHECK-SAME: (%[[I32:.*]]: i32, %[[FLOAT:.*]]: f32, %[[PTR1:.*]]: !llvm.ptr, %[[PTR2:.*]]: !llvm.ptr, %[[BOOL:.*]]: i1, %[[VPTR1:.*]]: !llvm.vec<2 x ptr>)
 func.func @ops(%arg0: i32, %arg1: f32,
@@ -98,19 +93,6 @@ func.func @ops(%arg0: i32, %arg1: f32,
   llvm.call %variadic_func(%arg0, %arg0) vararg(!llvm.func<void (i32, ...)>) : !llvm.ptr, (i32, i32) -> ()
   llvm.call %variadic_func(%arg0, %arg0) vararg(!llvm.func<void (i32, ...)>) {fastmathFlags = #llvm.fastmath<fast>} : !llvm.ptr, (i32, i32) -> ()
 
-// Function call attributes
-// CHECK: llvm.call @baz() {convergent} : () -> ()
-  llvm.call @baz() {convergent} : () -> ()
-
-// CHECK: llvm.call @baz() {no_unwind} : () -> ()
-  llvm.call @baz() {no_unwind} : () -> ()
-
-// CHECK: llvm.call @baz() {will_return} : () -> ()
-  llvm.call @baz() {will_return} : () -> ()
-
-// CHECK: llvm.call @baz() {memory = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = write>} : () -> ()
-  llvm.call @baz() {memory = #llvm.memory_effects<other = none, argMem = read, inaccessibleMem = write>} : () -> ()
-
 // Terminator operations and their successors.
 //
 // CHECK: llvm.br ^[[BB1:.*]]
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index d34bc8c1c54f6..928030a81dc02 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -1017,7 +1017,7 @@ func.func @broadcast_same_shape(%input: tensor<2x3xf32>, %init: tensor<2x3xf32>)
   return %0 : tensor<2x3xf32>
 }
 
-// -----
+// -----
 
 func.func @transpose_1d(%input: tensor<16xf32>,
                         %init: tensor<16xf32>) -> tensor<16xf32> {
@@ -1096,76 +1096,3 @@ func.func @transpose_transpose_fold(%input: tensor<5x4x3xf32>,
   func.return %transpose2 : tensor<3x4x5xf32>
 }
 
-// -----
-
-func.func @broadcast_transpose_fold(%input: tensor<2x4x5xf32>,
-                                    %init1: tensor<1x2x3x4x5x6xf32>,
-                                    %init2: tensor<1x6x2x3x5x4xf32>) -> tensor<1x6x2x3x5x4xf32> {
-  // CHECK-LABEL: @broadcast_transpose_fold
-  //  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<2x4x5xf32>
-  //  CHECK-SAME:     %[[INIT1:[a-zA-Z0-9]+]]: tensor<1x2x3x4x5x6xf32>
-  //  CHECK-SAME:     %[[INIT2:[a-zA-Z0-9]+]]: tensor<1x6x2x3x5x4xf32>
-  //       CHECK:   %[[TMP_INIT:.+]] = tensor.empty() : tensor<2x5x4xf32>
-  //       CHECK:   %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[INPUT]] : tensor<2x4x5xf32>) outs(%[[TMP_INIT]] : tensor<2x5x4xf32>) permutation = [0, 2, 1]
-  //       CHECK:   %[[BROADCAST:.+]] = linalg.broadcast ins(%[[TRANSPOSE]] : tensor<2x5x4xf32>) outs(%[[INIT2]] : tensor<1x6x2x3x5x4xf32>) dimensions = [0, 3, 1]
-  //       CHECK:   return %[[BROADCAST]] : tensor<1x6x2x3x5x4xf32>
-  %broadcast = linalg.broadcast
-      ins(%input : tensor<2x4x5xf32>)
-      outs(%init1 : tensor<1x2x3x4x5x6xf32>)
-      dimensions = [0, 2, 5]
-  %transpose = linalg.transpose
-      ins(%broadcast : tensor<1x2x3x4x5x6xf32>)
-      outs(%init2 : tensor<1x6x2x3x5x4xf32>)
-      permutation = [0, 5, 1, 2, 4, 3]
-  func.return %transpose : tensor<1x6x2x3x5x4xf32>
-}
-
-// -----
-
-func.func @broadcast_transpose_fold_dynamic(%input: tensor<?x?x5xf32>,
-                                            %init1: tensor<1x?x3x?x5x6xf32>,
-                                            %init2: tensor<1x3x?x6x5x?xf32>) -> tensor<1x3x?x6x5x?xf32> {
-  // CHECK-LABEL: @broadcast_transpose_fold_dynamic
-  //  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<?x?x5xf32>
-  //  CHECK-SAME:     %[[INIT1:[a-zA-Z0-9]+]]: tensor<1x?x3x?x5x6xf32>
-  //  CHECK-SAME:     %[[INIT2:[a-zA-Z0-9]+]]: tensor<1x3x?x6x5x?xf32>
-  //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-  //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-  //       CHECK:   %[[DIM0:.+]] = tensor.dim %[[INPUT]], %[[C0]] : tensor<?x?x5xf32>
-  //       CHECK:   %[[DIM1:.+]] = tensor.dim %[[INPUT]], %[[C1]] : tensor<?x?x5xf32>
-  //       CHECK:   %[[TMP_INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]]) : tensor<?x5x?xf32>
-  //       CHECK:   %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[INPUT]] : tensor<?x?x5xf32>) outs(%[[TMP_INIT]] : tensor<?x5x?xf32>) permutation = [1, 2, 0]
-  //       CHECK:   %[[BROADCAST:.+]] = linalg.broadcast ins(%[[TRANSPOSE]] : tensor<?x5x?xf32>) outs(%[[INIT2]] : tensor<1x3x?x6x5x?xf32>) dimensions = [0, 1, 3]
-  //       CHECK:   return %[[BROADCAST]] : tensor<1x3x?x6x5x?xf32>
-  %broadcast = linalg.broadcast
-      ins(%input : tensor<?x?x5xf32>)
-      outs(%init1 : tensor<1x?x3x?x5x6xf32>)
-      dimensions = [0, 2, 5]
-  %transpose = linalg.transpose
-      ins(%broadcast : tensor<1x?x3x?x5x6xf32>)
-      outs(%init2 : tensor<1x3x?x6x5x?xf32>)
-      permutation = [0, 2, 3, 5, 4, 1]
-  func.return %transpose : tensor<1x3x?x6x5x?xf32>
-}
-
-// -----
-
-func.func @broadcast_transpose_fold_2dim(%input: tensor<2xf32>,
-                                         %init1: tensor<2x4xf32>,
-                                         %init2: tensor<4x2xf32>) -> tensor<4x2xf32> {
-  // CHECK-LABEL: @broadcast_transpose_fold_2dim
-  //  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<2xf32>
-  //  CHECK-SAME:     %[[INIT1:[a-zA-Z0-9]+]]: tensor<2x4xf32>
-  //  CHECK-SAME:     %[[INIT2:[a-zA-Z0-9]+]]: tensor<4x2xf32>
-  //       CHECK:   %[[BROADCAST:.+]] = linalg.broadcast ins(%[[INPUT]] : tensor<2xf32>) outs(%[[INIT2]] : tensor<4x2xf32>) dimensions = [0]
-  //       CHECK:   return %[[BROADCAST]] : tensor<4x2xf32>
-  %broadcast = linalg.broadcast
-      ins(%input : tensor<2xf32>)
-      outs(%init1 : tensor<2x4xf32>)
-      dimensions = [1]
-  %transpose = linalg.transpose
-      ins(%broadcast : tensor<2x4xf32>)
-      outs(%init2 : tensor<4x2xf32>)
-      permutation = [1, 0]
-  func.return %transpose : tensor<4x2xf32>
-}
diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
index 621baef82319f..303f841e8a828 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
@@ -110,12 +110,12 @@ func.func @transfer_read_dims_mismatch_non_zero_indices(
 
 func.func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
     %arg : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>,
-    %idx_1 : index,
-    %idx_2 : index) -> vector<2x2xf32> {
+    %idx0 : index,
+    %idx1 : index) -> vector<2x2xf32> {
 
   %c0 = arith.constant 0 : index
   %cst_1 = arith.constant 0.000000e+00 : f32
-  %8 = vector.transfer_read %arg[%c0, %idx_1, %idx_2, %c0], %cst_1 {in_bounds = [true, true]} :
+  %8 = vector.transfer_read %arg[%c0, %idx0, %idx1, %c0], %cst_1 {in_bounds = [true, true]} :
     memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, vector<2x2xf32>
   return %8 : vector<2x2xf32>
 }
@@ -123,8 +123,7 @@ func.func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
 // CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)>
 
 // CHECK-LABEL:  func.func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
-// CHECK:         %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]]
-// CHECK-SAME:      : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>>
+// CHECK:         %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>>
 // CHECK:         %[[APPLY:.*]] = affine.apply #[[$MAP]]()
 
 // CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
@@ -132,42 +131,10 @@ func.func @transfer_read_dims_mismatch_non_contiguous_non_zero_indices(
 
 // -----
 
-// The leading dynamic shapes don't affect whether this example is flattenable
-// or not. Indeed, those dynamic shapes are not candidates for flattening anyway.
+// The input memref has a dynamic trailing shape and hence is not flattened.
+// TODO: This case could be supported via memref.dim
 
-func.func @transfer_read_leading_dynamic_dims(
-    %arg : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>,
-    %idx_1 : index,
-    %idx_2 : index) -> vector<8x4xi8> {
-
-  %c0_i8 = arith.constant 0 : i8
-  %c0 = arith.constant 0 : index
-  %result = vector.transfer_read %arg[%idx_1, %idx_2, %c0, %c0], %c0_i8 {in_bounds = [true, true]} :
-    memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, vector<8x4xi8>
-  return %result : vector<8x4xi8>
-}
-
-// CHECK-LABEL: func @transfer_read_leading_dynamic_dims
-// CHECK-SAME:    %[[ARG0:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index
-// CHECK:         %[[C0_I8:.+]] = arith.constant 0 : i8
-// CHECK:         %[[C0:.+]] = arith.constant 0 : index
-// CHECK:         %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG0]] {{\[}}[0], [1], [2, 3]{{\]}}
-// CHECK-SAME:      : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
-// CHECK:         %[[VEC1D:.+]] = vector.transfer_read %[[COLLAPSED]]
-// CHECK-SAME:    [%[[ARG1]], %[[ARG2]], %[[C0]]], %[[C0_I8]]
-// CHECK-SAME:    {in_bounds = [true]}
-// CHECK-SAME:      : memref<?x?x32xi8, {{.+}}>, vector<32xi8>
-// CHECK:         %[[VEC2D:.+]] = vector.shape_cast %[[VEC1D]] : vector<32xi8> to vector<8x4xi8>
-// CHECK:         return %[[VEC2D]] : vector<8x4xi8>
-
-// CHECK-128B-LABEL: func @transfer_read_leading_dynamic_dims
-//       CHECK-128B:   memref.collapse_shape
-
-// -----
-
-// One of the dims to be flattened is dynamic - not supported ATM.
-
-func.func @negative_transfer_read_dynamic_dim_to_flatten(
+func.func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes(
     %idx_1: index,
     %idx_2: index,
     %m_in: memref<1x?x4x6xi32>) -> vector<1x2x6xi32> {
@@ -179,11 +146,11 @@ func.func @negative_transfer_read_dynamic_dim_to_flatten(
   return %v : vector<1x2x6xi32>
 }
 
-// CHECK-LABEL: func.func @negative_transfer_read_dynamic_dim_to_flatten
+// CHECK-LABEL: func.func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes(
 // CHECK-NOT: memref.collapse_shape
 // CHECK-NOT: vector.shape_cast
 
-// CHECK-128B-LABEL: func @negative_transfer_read_dynamic_dim_to_flatten
+// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes(
 //   CHECK-128B-NOT:   memref.collapse_shape
 
 // -----
@@ -359,11 +326,11 @@ func.func @transfer_write_dims_mismatch_non_zero_indices(
 func.func @transfer_write_dims_mismatch_non_contiguous_non_zero_indices(
     %value : vector<2x2xf32>,
     %subview : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>,
-    %idx_1 : index,
-    %idx_2 : index) {
+    %idx0 : index,
+    %idx1 : index) {
 
   %c0 = arith.constant 0 : index
-  vector.transfer_write %value, %subview[%c0, %idx_1, %idx_2, %c0] {in_bounds = [true, true]} : vector<2x2xf32>, memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>
+  vector.transfer_write %value, %subview[%c0, %idx0, %idx1, %c0] {in_bounds = [true, true]} : vector<2x2xf32>, memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>
   return
 }
 
@@ -378,40 +345,10 @@ func.func @transfer_write_dims_mismatch_non_contiguous_non_zero_indices(
 
 // -----
 
-// The leading dynamic shapes don't affect whether this example is flattenable
-// or not. Indeed, those dynamic shapes are not candidates for flattening anyway.
-
-func.func @transfer_write_leading_dynamic_dims(
-    %vec : vector<8x4xi8>,
-    %arg : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>,
-    %idx_1 : index,
-    %idx_2 : index) {
-
-  %c0 = arith.constant 0 : index
-  vector.transfer_write %vec, %arg[%idx_1, %idx_2, %c0, %c0] {in_bounds = [true, true]} :
-    vector<8x4xi8>, memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>
-  return
-}
-
-// CHECK-LABEL: func @transfer_write_leading_dynamic_dims
-// CHECK-SAME:    %[[ARG0:.+]]: vector<8x4xi8>, %[[ARG1:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG2:.+]]: index, %[[ARG3:.+]]: index
-// CHECK:         %[[C0:.+]] = arith.constant 0 : index
-// CHECK:         %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}}
-// CHECK-SAME:      : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
-// CHECK:         %[[VEC1D:.+]] = vector.shape_cast %[[ARG0]] : vector<8x4xi8> to vector<32xi8>
-// CHECK:         vector.transfer_write %[[VEC1D]], %[[COLLAPSED]]
-// CHECK-SAME:      [%[[ARG2]], %[[ARG3]], %[[C0]]]
-// CHECK-SAME:      {in_bounds = [true]}
-// CHECK-SAME:      : vector<32xi8>, memref<?x?x32xi8, {{.+}}>
-
-// CHECK-128B-LABEL: func @transfer_write_leading_dynamic_dims
-//       CHECK-128B:   memref.collapse_shape
+// The input memref has a dynamic trailing shape and hence is not flattened.
+// TODO: This case could be supported via memref.dim
 
-// -----
-
-// One of the dims to be flattened is dynamic - not supported ATM.
-
-func.func @negative_transfer_write_dynamic_to_flatten(
+func.func @transfer_write_dims_mismatch_non_zero_indices_dynamic_shapes(
     %idx_1: index,
     %idx_2: index,
     %vec : vector<1x2x6xi32>,
@@ -424,11 +361,11 @@ func.func @negative_transfer_write_dynamic_to_flatten(
   return
 }
 
-// CHECK-LABEL: func.func @negative_transfer_write_dynamic_to_flatten
+// CHECK-LABEL: func.func @transfer_write_dims_mismatch_non_zero_indices_dynamic_shapes(
 // CHECK-NOT: memref.collapse_shape
 // CHECK-NOT: vector.shape_cast
 
-// CHECK-128B-LABEL: func @negative_transfer_write_dynamic_to_flatten
+// CHECK-128B-LABEL: func @transfer_write_dims_mismatch_non_zero_indices_dynamic_shapes(
 //   CHECK-128B-NOT:   memref.collapse_shape
 
 // -----
@@ -497,10 +434,56 @@ func.func @transfer_write_non_contiguous_src(
 // -----
 
 ///----------------------------------------------------------------------------------------
-/// [Pattern: DropUnitDimFromElementwiseOps]
-/// TODO: Move to a dedicated file - there's no "flattening" in the following tests
+/// TODO: Categorize + re-format
 ///----------------------------------------------------------------------------------------
 
+func.func @transfer_read_flattenable_with_dynamic_dims_and_indices(%arg0 : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, %arg1 : index, %arg2 : index) -> vector<8x4xi8> {
+    %c0_i8 = arith.constant 0 : i8
+    %c0 = arith.constant 0 : index
+    %result = vector.transfer_read %arg0[%arg1, %arg2, %c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, vector<8x4xi8>
+    return %result : vector<8x4xi8>
+}
+
+// CHECK-LABEL: func @transfer_read_flattenable_with_dynamic_dims_and_indices
+// CHECK-SAME:    %[[ARG0:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index
+// CHECK:       %[[C0_I8:.+]] = arith.constant 0 : i8
+// CHECK:       %[[C0:.+]] = arith.constant 0 : index
+// CHECK:       %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG0]] {{\[}}[0], [1], [2, 3]{{\]}}
+// CHECK-SAME:    : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
+// CHECK:       %[[VEC1D:.+]] = vector.transfer_read %[[COLLAPSED]]
+// CHECK-SAME:    [%[[ARG1]], %[[ARG2]], %[[C0]]], %[[C0_I8]]
+// CHECK-SAME:    {in_bounds = [true]}
+// CHECK-SAME:    : memref<?x?x32xi8, {{.+}}>, vector<32xi8>
+// CHECK:       %[[VEC2D:.+]] = vector.shape_cast %[[VEC1D]] : vector<32xi8> to vector<8x4xi8>
+// CHECK:       return %[[VEC2D]] : vector<8x4xi8>
+
+// CHECK-128B-LABEL: func @transfer_read_flattenable_with_dynamic_dims_and_indices(
+//       CHECK-128B:   memref.collapse_shape
+
+// -----
+
+func.func @transfer_write_flattenable_with_dynamic_dims_and_indices(%vec : vector<8x4xi8>, %dst : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, %arg1 : index, %arg2 : index) {
+    %c0 = arith.constant 0 : index
+    vector.transfer_write %vec, %dst[%arg1, %arg2, %c0, %c0] {in_bounds = [true, true]} : vector<8x4xi8>, memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>
+    return
+}
+
+// CHECK-LABEL: func @transfer_write_flattenable_with_dynamic_dims_and_indices
+// CHECK-SAME:    %[[ARG0:.+]]: vector<8x4xi8>, %[[ARG1:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG2:.+]]: index, %[[ARG3:.+]]: index
+// CHECK:       %[[C0:.+]] = arith.constant 0 : index
+// CHECK:       %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}}
+// CHECK-SAME:    : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
+// CHECK:       %[[VEC1D:.+]] = vector.shape_cast %[[ARG0]] : vector<8x4xi8> to vector<32xi8>
+// CHECK:       vector.transfer_write %[[VEC1D]], %[[COLLAPSED]]
+// CHECK-SAME:    [%[[ARG2]], %[[ARG3]], %[[C0]]]
+// CHECK-SAME:    {in_bounds = [true]}
+// CHECK-SAME:    : vector<32xi8>, memref<?x?x32xi8, {{.+}}>
+
+// CHECK-128B-LABEL: func @transfer_write_flattenable_with_dynamic_dims_and_indices(
+//       CHECK-128B:   memref.collapse_shape
+
+// -----
+
 func.func @fold_unit_dim_add_basic(%arg0 : vector<1x8xi32>) -> vector<1x8xi32> {
    %add = arith.addi %arg0, %arg0 : vector<1x8xi32>
    return %add : vector<1x8xi32>
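(For skimmers: the two *_flattenable_with_dynamic_dims_and_indices tests
re-added above exercise the flatten rewrite end to end. A hedged
reconstruction of the transfer_read rewrite from the CHECK lines follows; the
collapsed strided layout is inferred from the input strides, not spelled out
in the patch.)

    // Before: 2-D read out of a 4-D memref with two dynamic leading dims.
    %v = vector.transfer_read %arg0[%arg1, %arg2, %c0, %c0], %c0_i8
        {in_bounds = [true, true]}
        : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>, vector<8x4xi8>

    // After: only the contiguous trailing dims (8x4, strides 4 and 1) are
    // collapsed into one dim of 32; the dynamic leading dims are untouched.
    %collapsed = memref.collapse_shape %arg0 [[0], [1], [2, 3]]
        : memref<?x?x8x4xi8, strided<[?, 32, 4, 1], offset: ?>>
          into memref<?x?x32xi8, strided<[?, 32, 1], offset: ?>>
    %v1d = vector.transfer_read %collapsed[%arg1, %arg2, %c0], %c0_i8
        {in_bounds = [true]}
        : memref<?x?x32xi8, strided<[?, 32, 1], offset: ?>>, vector<32xi8>
    %v2d = vector.shape_cast %v1d : vector<32xi8> to vector<8x4xi8>
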
diff --git a/mlir/test/IR/attribute.mlir b/mlir/test/IR/attribute.mlir
index 362e98134ee4a..291d5832fce79 100644
--- a/mlir/test/IR/attribute.mlir
+++ b/mlir/test/IR/attribute.mlir
@@ -40,10 +40,6 @@ func.func @float_attrs_pass() {
     // CHECK: float_attr = 2.000000e+00 : f8E5M2
     float_attr = 2. : f8E5M2
   } : () -> ()
-  "test.float_attrs"() {
-    // CHECK: float_attr = 2.000000e+00 : f8E4M3
-    float_attr = 2. : f8E4M3
-  } : () -> ()
   "test.float_attrs"() {
     // CHECK: float_attr = 2.000000e+00 : f8E4M3FN
     float_attr = 2. : f8E4M3FN
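(Related: this hunk, together with the later hunks in llvmir.mlir,
builtin_types.py, mlirDataFormatters.py and grammar.js, drops all f8E4M3
references, presumably from rebasing over the removal of that type. A minimal
sketch of FP8 float attrs that still parse afterwards, lifted from the
surviving context lines:)

    "test.float_attrs"() {
      float_attr = 2. : f8E4M3FN
    } : () -> ()
    "test.float_attrs"() {
      float_attr = 2. : f8E5M2
    } : () -> ()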
diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll
index b4dad2deb3496..005aafb20a510 100644
--- a/mlir/test/Target/LLVMIR/Import/instructions.ll
+++ b/mlir/test/Target/LLVMIR/Import/instructions.ll
@@ -528,64 +528,6 @@ define void @varargs_call(i32 %0) {
 
 ; // -----
 
-; CHECK: llvm.func @f()
-declare void @f()
-
-; CHECK-LABEL: @call_convergent
-define void @call_convergent() {
-; CHECK: llvm.call @f() {convergent}
-  call void @f() convergent
-  ret void
-}
-
-; // -----
-
-; CHECK: llvm.func @f()
-declare void @f()
-
-; CHECK-LABEL: @call_no_unwind
-define void @call_no_unwind() {
-; CHECK: llvm.call @f() {no_unwind}
-  call void @f() nounwind
-  ret void
-}
-
-; // -----
-
-; CHECK: llvm.func @f()
-declare void @f()
-
-; CHECK-LABEL: @call_will_return
-define void @call_will_return() {
-; CHECK: llvm.call @f() {will_return}
-  call void @f() willreturn
-  ret void
-}
-
-; // -----
-
-; CHECK: llvm.func @f()
-declare void @f()
-
-; CHECK-LABEL: @call_memory_effects
-define void @call_memory_effects() {
-; CHECK: llvm.call @f() {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>}
-  call void @f() memory(none)
-; CHECK: llvm.call @f() {memory = #llvm.memory_effects<other = none, argMem = write, inaccessibleMem = read>}
-  call void @f() memory(none, argmem: write, inaccessiblemem: read)
-; CHECK: llvm.call @f() {memory = #llvm.memory_effects<other = write, argMem = none, inaccessibleMem = write>}
-  call void @f() memory(write, argmem: none)
-; CHECK: llvm.call @f() {memory = #llvm.memory_effects<other = readwrite, argMem = readwrite, inaccessibleMem = read>}
-  call void @f() memory(readwrite, inaccessiblemem: read)
-; CHECK: llvm.call @f()
-; CHECK-NOT: #llvm.memory_effects
-; CHECK-SAME: : () -> ()
-  call void @f() memory(readwrite)
-  ret void
-}
-
-; // -----
-
 %sub_struct = type { i32, i8 }
 %my_struct = type { %sub_struct, [4 x i32] }
 
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index ab5b65bde6305..132a8eb668eba 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -39,9 +39,6 @@ llvm.mlir.global internal constant @string_const("foobar") : !llvm.array<6 x i8>
 // CHECK: @int_global_undef = internal global i64 undef
 llvm.mlir.global internal @int_global_undef() : i64
 
-// CHECK: @f8E4M3_global_as_i8 = internal global i8 60
-llvm.mlir.global internal @f8E4M3_global_as_i8(1.5 : f8E4M3) : i8
-
 // CHECK: @f8E4M3FN_global_as_i8 = internal global i8 60
 llvm.mlir.global internal @f8E4M3FN_global_as_i8(1.5 : f8E4M3FN) : i8
 
@@ -2477,75 +2474,3 @@ llvm.func @willreturn() attributes { will_return } {
 
 // CHECK: #[[ATTRS]]
 // CHECK-SAME: willreturn
-
-// -----
-
-llvm.func @f()
-
-// CHECK-LABEL: @convergent_call
-// CHECK: call void @f() #[[ATTRS:[0-9]+]]
-llvm.func @convergent_call() {
-  llvm.call @f() {convergent} : () -> ()
-  llvm.return
-}
-
-// CHECK: #[[ATTRS]]
-// CHECK-SAME: convergent
-
-// -----
-
-llvm.func @f()
-
-// CHECK-LABEL: @nounwind_call
-// CHECK: call void @f() #[[ATTRS:[0-9]+]]
-llvm.func @nounwind_call() {
-  llvm.call @f() {no_unwind} : () -> ()
-  llvm.return
-}
-
-// CHECK: #[[ATTRS]]
-// CHECK-SAME: nounwind
-
-// -----
-
-llvm.func @f()
-
-// CHECK-LABEL: @willreturn_call
-// CHECK: call void @f() #[[ATTRS:[0-9]+]]
-llvm.func @willreturn_call() {
-  llvm.call @f() {will_return} : () -> ()
-  llvm.return
-}
-
-// CHECK: #[[ATTRS]]
-// CHECK-SAME: willreturn
-
-// -----
-
-llvm.func @fa()
-llvm.func @fb()
-llvm.func @fc()
-llvm.func @fd()
-
-// CHECK-LABEL: @mem_none_call
-// CHECK: call void @fa() #[[ATTRS_0:[0-9]+]]
-// CHECK: call void @fb() #[[ATTRS_1:[0-9]+]]
-// CHECK: call void @fc() #[[ATTRS_2:[0-9]+]]
-// CHECK: call void @fd() #[[ATTRS_3:[0-9]+]]
-llvm.func @mem_none_call() {
-  llvm.call @fa() {memory = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>} : () -> ()
-  llvm.call @fb() {memory = #llvm.memory_effects<other = read, argMem = none, inaccessibleMem = write>} : () -> ()
-  llvm.call @fc() {memory = #llvm.memory_effects<other = read, argMem = read, inaccessibleMem = write>} : () -> ()
-  llvm.call @fd() {memory = #llvm.memory_effects<other = readwrite, argMem = read, inaccessibleMem = readwrite>} : () -> ()
-  llvm.return
-
-}
-
-// CHECK: #[[ATTRS_0]]
-// CHECK-SAME: memory(none)
-// CHECK: #[[ATTRS_1]]
-// CHECK-SAME: memory(read, argmem: none, inaccessiblemem: write)
-// CHECK: #[[ATTRS_2]]
-// CHECK-SAME: memory(read, inaccessiblemem: write)
-// CHECK: #[[ATTRS_3]]
-// CHECK-SAME: memory(readwrite, argmem: read)
diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
deleted file mode 100644
index c386342005e5e..0000000000000
--- a/mlir/test/Target/LLVMIR/omptarget-depend.mlir
+++ /dev/null
@@ -1,140 +0,0 @@
-// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
-  llvm.func @_QQmain() attributes {fir.bindc_name = "main"} {
-    %0 = llvm.mlir.constant(39 : index) : i64
-    %1 = llvm.mlir.constant(0 : index) : i64
-    %2 = llvm.mlir.constant(1 : index) : i64
-    %3 = llvm.mlir.constant(40 : index) : i64
-    %4 = llvm.mlir.addressof @_QFEa : !llvm.ptr
-    %5 = llvm.mlir.addressof @_QFEb : !llvm.ptr
-    %6 = llvm.mlir.constant(1 : i64) : i64
-    %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
-    %8 = llvm.mlir.addressof @_QFEn : !llvm.ptr
-    omp.task {
-      %14 = llvm.mlir.constant(1 : i64) : i64
-      %15 = llvm.alloca %14 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
-      %16 = llvm.load %8 : !llvm.ptr -> i32
-      %17 = llvm.sext %16 : i32 to i64
-      %18 = llvm.trunc %2 : i64 to i32
-      llvm.br ^bb1(%18, %17 : i32, i64)
-    ^bb1(%19: i32, %20: i64):  // 2 preds: ^bb0, ^bb2
-      %21 = llvm.icmp "sgt" %20, %1 : i64
-      llvm.cond_br %21, ^bb2, ^bb3
-    ^bb2:  // pred: ^bb1
-      llvm.store %19, %15 : i32, !llvm.ptr
-      %22 = llvm.load %15 : !llvm.ptr -> i32
-      %23 = llvm.sext %22 : i32 to i64
-      %24 = llvm.mlir.constant(1 : i64) : i64
-      %25 = llvm.mlir.constant(0 : i64) : i64
-      %26 = llvm.sub %23, %24 overflow<nsw> : i64
-      %27 = llvm.mul %26, %24 overflow<nsw> : i64
-      %28 = llvm.mul %27, %24 overflow<nsw> : i64
-      %29 = llvm.add %28, %25 overflow<nsw> : i64
-      %30 = llvm.mul %24, %3 overflow<nsw> : i64
-      %31 = llvm.getelementptr %4[%29] : (!llvm.ptr, i64) -> !llvm.ptr, i32
-      llvm.store %22, %31 : i32, !llvm.ptr
-      %32 = llvm.load %15 : !llvm.ptr -> i32
-      %33 = llvm.add %32, %18 : i32
-      %34 = llvm.sub %20, %2 : i64
-      llvm.br ^bb1(%33, %34 : i32, i64)
-    ^bb3:  // pred: ^bb1
-      llvm.store %19, %15 : i32, !llvm.ptr
-      omp.terminator
-    }
-    %9 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%0 : i64) extent(%3 : i64) stride(%2 : i64) start_idx(%2 : i64)
-    %10 = omp.map.info var_ptr(%4 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(to) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "a"}
-    %11 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(from) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "b"}
-    %12 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
-    %13 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"}
-    omp.target map_entries(%10 -> %arg0, %11 -> %arg1, %12 -> %arg2, %13 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) depend(taskdependin -> %4 : !llvm.ptr) {
-    ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr):
-      %14 = llvm.mlir.constant(0 : index) : i64
-      %15 = llvm.mlir.constant(10 : i32) : i32
-      %16 = llvm.mlir.constant(1 : index) : i64
-      %17 = llvm.mlir.constant(40 : index) : i64
-      %18 = llvm.load %arg3 : !llvm.ptr -> i32
-      %19 = llvm.sext %18 : i32 to i64
-      %20 = llvm.trunc %16 : i64 to i32
-      llvm.br ^bb1(%20, %19 : i32, i64)
-    ^bb1(%21: i32, %22: i64):  // 2 preds: ^bb0, ^bb2
-      %23 = llvm.icmp "sgt" %22, %14 : i64
-      llvm.cond_br %23, ^bb2, ^bb3
-    ^bb2:  // pred: ^bb1
-      llvm.store %21, %arg2 : i32, !llvm.ptr
-      %24 = llvm.load %arg2 : !llvm.ptr -> i32
-      %25 = llvm.sext %24 : i32 to i64
-      %26 = llvm.mlir.constant(1 : i64) : i64
-      %27 = llvm.mlir.constant(0 : i64) : i64
-      %28 = llvm.sub %25, %26 overflow<nsw> : i64
-      %29 = llvm.mul %28, %26 overflow<nsw> : i64
-      %30 = llvm.mul %29, %26 overflow<nsw> : i64
-      %31 = llvm.add %30, %27 overflow<nsw> : i64
-      %32 = llvm.mul %26, %17 overflow<nsw> : i64
-      %33 = llvm.getelementptr %arg0[%31] : (!llvm.ptr, i64) -> !llvm.ptr, i32
-      %34 = llvm.load %33 : !llvm.ptr -> i32
-      %35 = llvm.add %34, %15 : i32
-      %36 = llvm.mlir.constant(1 : i64) : i64
-      %37 = llvm.mlir.constant(0 : i64) : i64
-      %38 = llvm.sub %25, %36 overflow<nsw> : i64
-      %39 = llvm.mul %38, %36 overflow<nsw> : i64
-      %40 = llvm.mul %39, %36 overflow<nsw> : i64
-      %41 = llvm.add %40, %37 overflow<nsw> : i64
-      %42 = llvm.mul %36, %17 overflow<nsw> : i64
-      %43 = llvm.getelementptr %arg1[%41] : (!llvm.ptr, i64) -> !llvm.ptr, i32
-      llvm.store %35, %43 : i32, !llvm.ptr
-      %44 = llvm.load %arg2 : !llvm.ptr -> i32
-      %45 = llvm.add %44, %20 : i32
-      %46 = llvm.sub %22, %16 : i64
-      llvm.br ^bb1(%45, %46 : i32, i64)
-    ^bb3:  // pred: ^bb1
-      llvm.store %21, %arg2 : i32, !llvm.ptr
-      omp.terminator
-    }
-    llvm.return
-  }
-  llvm.mlir.global internal @_QFEa() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
-    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
-    llvm.return %0 : !llvm.array<40 x i32>
-  }
-  llvm.mlir.global internal @_QFEb() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
-    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
-    llvm.return %0 : !llvm.array<40 x i32>
-  }
-  llvm.mlir.global internal @_QFEc() {addr_space = 0 : i32} : !llvm.array<40 x i32> {
-    %0 = llvm.mlir.zero : !llvm.array<40 x i32>
-    llvm.return %0 : !llvm.array<40 x i32>
-  }
-  llvm.mlir.global internal @_QFEn() {addr_space = 0 : i32} : i32 {
-    %0 = llvm.mlir.constant(40 : i32) : i32
-    llvm.return %0 : i32
-  }
-  llvm.func @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
-  llvm.func @_FortranAProgramEndStatement() attributes {sym_visibility = "private"}
-  llvm.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 {
-    %0 = llvm.mlir.constant(0 : i32) : i32
-    %1 = llvm.mlir.zero : !llvm.ptr
-    llvm.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %1) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> ()
-    llvm.call @_QQmain() {fastmathFlags = #llvm.fastmath<contract>} : () -> ()
-    llvm.call @_FortranAProgramEndStatement() {fastmathFlags = #llvm.fastmath<contract>} : () -> ()
-    llvm.return %0 : i32
-  }
-
-// %strucArg holds pointers to shared data.
-// CHECK: define void @_QQmain() {
-// CHECK-DAG: %[[STRUCTARG:.+]] = alloca { ptr, ptr, ptr }, align 8
-// CHECK-DAG:  %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
-// CHECK: %[[DEP_INFO:.+]]  = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0
-// CHECK: %[[PTR0:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 0
-// CHECK: store i64 ptrtoint (ptr @_QFEa to i64), ptr %[[PTR0]], align 4
-// CHECK: %[[PTR1:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 1
-// CHECK: store i64 8, ptr %[[PTR1]], align 4
-// CHECK: %[[PTR2:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 2
-// CHECK: store i8 1, ptr %[[PTR2]], align 1
-
-// CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func)
-// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false)
-// CHECK: call void @__kmpc_omp_wait_deps({{.+}}, i32 1, ptr %[[DEP_ARRAY]], i32 0, ptr null)
-// CHECK: call void @__kmpc_omp_task_begin_if0({{.+}}, ptr  %[[TASKDATA]])
-// CHECK: call void @.omp_target_task_proxy_func({{.+}}, ptr %[[TASKDATA]])
-// CHECK: call void @__kmpc_omp_task_complete_if0({{.+}}, ptr %[[TASKDATA]])
-	      
diff --git a/mlir/test/python/ir/builtin_types.py b/mlir/test/python/ir/builtin_types.py
index 3178f58cf2e74..cfe377c703717 100644
--- a/mlir/test/python/ir/builtin_types.py
+++ b/mlir/test/python/ir/builtin_types.py
@@ -113,8 +113,6 @@ def testTypeIsInstance():
 def testFloatTypeSubclasses():
     ctx = Context()
     # CHECK: True
-    print(isinstance(Type.parse("f8E4M3", ctx), FloatType))
-    # CHECK: True
     print(isinstance(Type.parse("f8E4M3FN", ctx), FloatType))
     # CHECK: True
     print(isinstance(Type.parse("f8E5M2", ctx), FloatType))
@@ -231,8 +229,6 @@ def testIndexType():
 @run
 def testFloatType():
     with Context():
-        # CHECK: float: f8E4M3
-        print("float:", Float8E4M3Type.get())
         # CHECK: float: f8E4M3FN
         print("float:", Float8E4M3FNType.get())
         # CHECK: float: f8E5M2
@@ -605,7 +601,6 @@ def testTypeIDs():
         types = [
             (IntegerType, IntegerType.get_signless(16)),
             (IndexType, IndexType.get()),
-            (Float8E4M3Type, Float8E4M3Type.get()),
             (Float8E4M3FNType, Float8E4M3FNType.get()),
             (Float8E5M2Type, Float8E5M2Type.get()),
             (Float8E4M3FNUZType, Float8E4M3FNUZType.get()),
@@ -629,7 +624,6 @@ def testTypeIDs():
 
         # CHECK: IntegerType(i16)
         # CHECK: IndexType(index)
-        # CHECK: Float8E4M3Type(f8E4M3)
         # CHECK: Float8E4M3FNType(f8E4M3FN)
         # CHECK: Float8E5M2Type(f8E5M2)
         # CHECK: Float8E4M3FNUZType(f8E4M3FNUZ)
@@ -710,9 +704,6 @@ def print_downcasted(typ):
         # CHECK: Float8E4M3B11FNUZType
         # CHECK: Float8E4M3B11FNUZType(f8E4M3B11FNUZ)
         print_downcasted(Float8E4M3B11FNUZType.get())
-        # CHECK: Float8E4M3Type
-        # CHECK: Float8E4M3Type(f8E4M3)
-        print_downcasted(Float8E4M3Type.get())
         # CHECK: Float8E4M3FNType
         # CHECK: Float8E4M3FNType(f8E4M3FN)
         print_downcasted(Float8E4M3FNType.get())
diff --git a/mlir/unittests/IR/AffineExprTest.cpp b/mlir/unittests/IR/AffineExprTest.cpp
index dc78bbac85f3c..75c893334943d 100644
--- a/mlir/unittests/IR/AffineExprTest.cpp
+++ b/mlir/unittests/IR/AffineExprTest.cpp
@@ -106,9 +106,3 @@ TEST(AffineExprTest, modSimplificationRegression) {
   auto sum = d0 + d0.floorDiv(3).floorDiv(-3);
   ASSERT_EQ(sum.getKind(), AffineExprKind::Add);
 }
-
-TEST(AffineExprTest, divisorOfNegativeFloorDiv) {
-  MLIRContext ctx;
-  OpBuilder b(&ctx);
-  ASSERT_EQ(b.getAffineDimExpr(0).floorDiv(-1).getLargestKnownDivisor(), 1);
-}
diff --git a/mlir/unittests/IR/ShapedTypeTest.cpp b/mlir/unittests/IR/ShapedTypeTest.cpp
index 7a5b0722a03ba..61264bc523648 100644
--- a/mlir/unittests/IR/ShapedTypeTest.cpp
+++ b/mlir/unittests/IR/ShapedTypeTest.cpp
@@ -11,7 +11,6 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/DialectInterface.h"
-#include "mlir/Support/LLVM.h"
 #include "llvm/ADT/SmallVector.h"
 #include "gtest/gtest.h"
 #include <cstdint>
@@ -227,61 +226,4 @@ TEST(ShapedTypeTest, RankedTensorTypeBuilder) {
   }
 }
 
-/// Simple wrapper class to enable "isa querying" and simple accessing of
-/// encoding.
-class TensorWithString : public RankedTensorType {
-public:
-  using RankedTensorType::RankedTensorType;
-
-  static TensorWithString get(ArrayRef<int64_t> shape, Type elementType,
-                              StringRef name) {
-    return mlir::cast<TensorWithString>(RankedTensorType::get(
-        shape, elementType, StringAttr::get(elementType.getContext(), name)));
-  }
-
-  StringRef getName() const {
-    if (Attribute enc = getEncoding())
-      return mlir::cast<StringAttr>(enc).getValue();
-    return {};
-  }
-
-  static bool classof(Type type) {
-    if (auto rt = mlir::dyn_cast_or_null<RankedTensorType>(type))
-      return mlir::isa_and_present<StringAttr>(rt.getEncoding());
-    return false;
-  }
-};
-
-TEST(ShapedTypeTest, RankedTensorTypeView) {
-  MLIRContext context;
-  Type f32 = FloatType::getF32(&context);
-
-  Type noEncodingRankedTensorType = RankedTensorType::get({10, 20}, f32);
-
-  UnitAttr unitAttr = UnitAttr::get(&context);
-  Type unitEncodingRankedTensorType =
-      RankedTensorType::get({10, 20}, f32, unitAttr);
-
-  StringAttr stringAttr = StringAttr::get(&context, "app");
-  Type stringEncodingRankedTensorType =
-      RankedTensorType::get({10, 20}, f32, stringAttr);
-
-  EXPECT_FALSE(mlir::isa<TensorWithString>(noEncodingRankedTensorType));
-  EXPECT_FALSE(mlir::isa<TensorWithString>(unitEncodingRankedTensorType));
-  ASSERT_TRUE(mlir::isa<TensorWithString>(stringEncodingRankedTensorType));
-
-  // Cast to TensorWithString view.
-  auto view = mlir::cast<TensorWithString>(stringEncodingRankedTensorType);
-  ASSERT_TRUE(mlir::isa<TensorWithString>(view));
-  EXPECT_EQ(view.getName(), "app");
-  // Verify one could cast view type back to base type.
-  ASSERT_TRUE(mlir::isa<RankedTensorType>(view));
-
-  Type viewCreated = TensorWithString::get({10, 20}, f32, "bob");
-  ASSERT_TRUE(mlir::isa<TensorWithString>(viewCreated));
-  ASSERT_TRUE(mlir::isa<RankedTensorType>(viewCreated));
-  view = mlir::cast<TensorWithString>(viewCreated);
-  EXPECT_EQ(view.getName(), "bob");
-}
-
 } // namespace
diff --git a/mlir/utils/lldb-scripts/mlirDataFormatters.py b/mlir/utils/lldb-scripts/mlirDataFormatters.py
index ed0ee431fd7d8..3282b4a32d503 100644
--- a/mlir/utils/lldb-scripts/mlirDataFormatters.py
+++ b/mlir/utils/lldb-scripts/mlirDataFormatters.py
@@ -51,7 +51,6 @@ def build_ptr_str_from_addr(addrValue: lldb.SBValue, type: lldb.SBType):
     "mlir::FusedLoc": '"loc(fused<...>[...])"',
     "mlir::UnknownLoc": '"loc(unknown)"',
     "mlir::Float8E5M2Type": '"f8E5M2"',
-    "mlir::Float8E4M3Type": '"f8E4M3"',
     "mlir::Float8E4M3FNType": '"f8E4M3FN"',
     "mlir::Float8E5M2FNUZType": '"f8E5M2FNUZ"',
     "mlir::Float8E4M3FNUZType": '"f8E4M3FNUZ"',
diff --git a/mlir/utils/tree-sitter-mlir/grammar.js b/mlir/utils/tree-sitter-mlir/grammar.js
index a657874f894b7..0e7075b124b55 100644
--- a/mlir/utils/tree-sitter-mlir/grammar.js
+++ b/mlir/utils/tree-sitter-mlir/grammar.js
@@ -230,8 +230,7 @@ const common = {
   integer_type : $ =>
       token(seq(choice('si', 'ui', 'i'), /[1-9]/, repeat(/[0-9]/))),
   float_type : $ => token(
-      choice('f16', 'f32', 'f64', 'f80', 'f128', 'bf16', 'f8E4M3FN', 'f8E4M3',
-             'f8E5M2')),
+      choice('f16', 'f32', 'f64', 'f80', 'f128', 'bf16', 'f8E4M3FN', 'f8E5M2')),
   index_type : $ => token('index'),
   none_type : $ => token('none'),
   complex_type : $ => seq(token('complex'), '<', $._prim_type, '>'),
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 15b991f202539..031a5ced25518 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -166,6 +166,9 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   // From this point forward we know that there is no thread state used.
   ASSERT(state::HasThreadState == false, nullptr);
 
+  uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
+  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
   if (mapping::isSPMDMode()) {
     // This was moved to its own routine so it could be called directly
     // in certain situations to avoid resource consumption of unused
@@ -175,10 +178,6 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
     return;
   }
 
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
-  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
-
   // We do *not* create a new data environment because all threads in the team
   // that are active are now running this parallel region. They share the
   // TeamState, which has an increase level-var and potentially active-level
diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90
deleted file mode 100644
index 928eb671c9706..0000000000000
--- a/offload/test/offloading/fortran/target-depend.f90
+++ /dev/null
@@ -1,69 +0,0 @@
-! Offloading test checking the use of the depend clause on
-! the target construct
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-  implicit none
-  integer :: a = 0
-  INTERFACE
-     FUNCTION omp_get_device_num() BIND(C)
-       USE, INTRINSIC :: iso_c_binding, ONLY: C_INT
-       integer :: omp_get_device_num
-     END FUNCTION omp_get_device_num
-  END INTERFACE
-
-  call foo(5, a)
-  print*, "======= FORTRAN Test passed! ======="
-  print*, "foo(5) returned ", a, ", expected 6\n"
-
-  !       stop 0
-  contains
-    subroutine foo(N, r)
-      integer, intent(in) :: N
-      integer, intent(out) :: r
-      integer :: z, i, accumulator
-      z = 1
-      accumulator = 0
-      ! Spawn 3 threads
-      !$omp parallel num_threads(3)
-
-      ! A single thread will then create two tasks - one is the 'producer' and
-      ! potentially slower task that updates 'z' to 'N'. The second is an
-      ! offloaded target task that increments 'z'. If the depend clauses work
-      ! properly, the target task should wait for the 'producer' task to
-      ! complete before incrementing 'z'. We use 'omp single' here because the
-      ! depend clause establishes dependencies between sibling tasks only.
-      ! This is the easiest way of creating two sibling tasks.
-      !$omp single
-      !$omp task depend(out: z) shared(z)
-      do i=1, 32766
-         ! dumb loop nest to slow down the update of 'z'.
-         ! Adding a function call slows down the producer to the point
-         ! that removing the depend clause from the target construct below
-         ! frequently results in the wrong answer.
-         accumulator = accumulator + omp_get_device_num()
-      end do
-      z = N
-      !$omp end task
-
-      ! z is 5 now. Increment z to 6.
-      !$omp target map(tofrom: z) depend(in:z)
-      z = z + 1
-      !$omp end target
-      !$omp end single
-      !$omp end parallel
-      ! Use 'accumulator' so it is not optimized away by the compiler.
-      print *, accumulator
-      r = z
-    end subroutine foo
-
-!CHECK: ======= FORTRAN Test passed! =======
-!CHECK: foo(5) returned 6 , expected 6
-end program main
diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst
index d4a4b1a99f781..df944c3be68de 100644
--- a/openmp/docs/ReleaseNotes.rst
+++ b/openmp/docs/ReleaseNotes.rst
@@ -1,5 +1,5 @@
 ===========================
-OpenMP 20.0.0 Release Notes
+OpenMP 19.0.0 Release Notes
 ===========================
 
 
diff --git a/pstl/docs/ReleaseNotes.rst b/pstl/docs/ReleaseNotes.rst
index 941b029ecd674..342b2eafc2f51 100644
--- a/pstl/docs/ReleaseNotes.rst
+++ b/pstl/docs/ReleaseNotes.rst
@@ -1,5 +1,5 @@
 =======================================
-PSTL 20.0.0 (In-Progress) Release Notes
+PSTL 19.0.0 (In-Progress) Release Notes
 =======================================
 
 .. contents::
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index eab5e28511c2c..63fd3faf38aef 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1076,22 +1076,6 @@ libc_support_library(
     ],
 )
 
-libc_support_library(
-    name = "__support_osutil_pid",
-    srcs = ["src/__support/OSUtil/linux/pid.cpp"],
-    hdrs = ["src/__support/OSUtil/pid.h"],
-    target_compatible_with = select({
-        "@platforms//os:linux": [],
-        "//conditions:default": ["@platforms//:incompatible"],
-    }),
-    deps = [
-        ":__support_macros_attributes",
-        ":__support_macros_optimization",
-        ":__support_osutil_syscall",
-        ":types_pid_t",
-    ]
-)
-
 libc_support_library(
     name = "__support_stringutil",
     srcs = glob(["src/__support/StringUtil/tables/**/*.h"]) + [
@@ -3048,11 +3032,8 @@ libc_function(
     hdrs = ["src/unistd/getpid.h"],
     deps = [
         ":__support_common",
-        ":__support_macros_config",
-        ":__support_osutil_pid",
         ":__support_osutil_syscall",
         ":errno",
-        ":types_pid_t",
     ],
 )
 


