[flang-commits] [clang] [clang-tools-extra] [compiler-rt] [flang] [libc] [libclc] [lldb] [llvm] [libc][math] Refactor bf16mul family to header-only (PR #182018)

Mohamed Emad via flang-commits flang-commits at lists.llvm.org
Mon Feb 23 18:15:18 PST 2026


https://github.com/hulxv updated https://github.com/llvm/llvm-project/pull/182018

>From 36635f8c6d4dc09398c5cdef70a30c53b3016b90 Mon Sep 17 00:00:00 2001
From: hulxv <hulxxv at gmail.com>
Date: Thu, 19 Feb 2026 02:28:59 +0200
Subject: [PATCH 1/2] [libc][math] Refactor bf16mul family to header-only

Refactored functions:
  - bf16mul
  - bf16mulf
  - bf16mulf128
  - bf16mull
---
 clang-tools-extra/clangd/TidyProvider.cpp     |    3 +-
 .../abseil/unchecked-statusor-access.rst      |   42 +-
 clang/docs/ReleaseNotes.rst                   |    2 +
 clang/docs/analyzer/checkers.rst              |   98 +-
 .../FlowSensitive/DataflowAnalysisContext.h   |   21 +-
 .../SerializationFormatRegistry.h             |   21 +-
 clang/include/clang/Basic/BuiltinsAMDGPU.td   |  148 +-
 clang/include/clang/Basic/CodeGenOptions.def  |    3 +
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |    8 -
 clang/include/clang/Options/Options.td        |   17 +-
 .../clang/StaticAnalyzer/Checkers/Checkers.td |   10 +-
 clang/lib/AST/Stmt.cpp                        |   47 +-
 .../FlowSensitive/DataflowAnalysisContext.cpp |   12 +-
 .../lib/Analysis/FlowSensitive/RecordOps.cpp  |    4 +-
 clang/lib/Basic/ParsedAttrInfo.cpp            |   17 +-
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |   62 +-
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp    |   19 +-
 clang/lib/CodeGen/CGExpr.cpp                  |   11 +-
 clang/lib/CodeGen/CGHLSLRuntime.cpp           |   26 +
 clang/lib/CodeGen/CGHLSLRuntime.h             |    3 +
 clang/lib/CodeGen/CodeGenAction.cpp           |    4 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |    4 +-
 clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp |   27 +-
 clang/lib/CodeGen/HLSLBufferLayoutBuilder.h   |    7 +
 clang/lib/Driver/Driver.cpp                   |    7 +
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |   11 +-
 clang/lib/Driver/ToolChains/Darwin.cpp        |   15 +-
 clang/lib/Sema/SemaHLSL.cpp                   |    4 +-
 clang/lib/Sema/SemaInit.cpp                   |   37 +-
 clang/lib/Sema/SemaType.cpp                   |   76 +-
 clang/lib/Serialization/ASTReaderDecl.cpp     |   14 +
 .../Checkers/BasicObjCFoundationChecks.cpp    |   16 +-
 .../Checkers/WebKit/NoDeleteChecker.cpp       |    6 +-
 .../Checkers/WebKit/PtrTypesSemantics.cpp     |   82 +-
 .../Checkers/WebKit/PtrTypesSemantics.h       |    5 +
 .../Analysis/Checkers/WebKit/mock-types.h     |   42 +-
 .../Checkers/WebKit/nodelete-annotation.cpp   |  123 +
 .../Checkers/WebKit/uncounted-local-vars.cpp  |    6 +
 .../test/Analysis/analyzer-enabled-checkers.c |    1 -
 clang/test/Analysis/builtin_bitcast.cpp       |    2 +-
 clang/test/Analysis/concrete-address.c        |    2 +-
 clang/test/Analysis/dtor.cpp                  |    2 +-
 clang/test/Analysis/fixed-address-notes.c     |    2 +-
 clang/test/Analysis/misc-ps.m                 |    4 +-
 clang/test/Analysis/pr22954.c                 |    2 +-
 ...c-library-functions-arg-enabled-checkers.c |    1 -
 ...ress-dereferences-from-any-address-space.c |    8 +-
 .../builtin-structured-binding-size.cpp       |   30 +
 .../builtin-trivally-copyable.cpp             |   57 +
 .../builtin-types-compatible.c                |   59 +
 clang/test/CodeGen/AArch64/neon/fullfp16.c    |   22 +
 .../CodeGen/AArch64/v8.2a-fp16-intrinsics.c   |   16 -
 clang/test/CodeGen/attr-no-outline.c          |  120 +-
 .../MatrixToAndFromVectorConstructors.hlsl    |  121 +
 clang/test/CodeGenHLSL/matrix_types.hlsl      |   36 +-
 .../resources/cbuffer_matrix_align.hlsl       |   71 +
 .../default_cbuffer_with_layout.hlsl          |   11 +-
 clang/test/CodeGenObjC/attr-no-outline.m      |   73 +-
 .../builtins-amdgcn-gfx12-wmma-w32.cl         |  141 +-
 .../builtins-amdgcn-gfx12-wmma-w64.cl         |  141 +-
 .../builtins-amdgcn-wmma-w32-gfx10-err.cl     |   16 +-
 .../builtins-amdgcn-wmma-w64-gfx10-err.cl     |   16 +-
 .../usr/include/libxml/.keep                  |    0
 clang/test/Driver/aarch64-outliner.c          |    2 +-
 clang/test/Driver/arm-machine-outliner.c      |    2 +-
 clang/test/Driver/crash-diagnostics-dir-3.c   |    2 +-
 clang/test/Driver/crash-diagnostics-dir.c     |    2 +-
 clang/test/Driver/crash-ir-repro.cpp          |    2 +-
 clang/test/Driver/crash-report-clang-cl.cpp   |    2 +-
 clang/test/Driver/crash-report-header.h       |    2 +-
 clang/test/Driver/crash-report-spaces.c       |    2 +-
 clang/test/Driver/crash-report-with-asserts.c |    4 +-
 clang/test/Driver/crash-report.cpp            |    4 +-
 clang/test/Driver/emit-reproducer.c           |   14 +-
 clang/test/Driver/incompatible_sysroot.c      |   11 +-
 clang/test/Driver/lit.local.cfg               |    7 +
 clang/test/Driver/output-file-cleanup.c       |    2 +-
 clang/test/Driver/riscv-outliner.c            |    2 +-
 clang/test/Driver/x86-outliner.c              |    2 +-
 clang/test/Modules/demote-var-def.cpp         |   94 -
 clang/test/Modules/pr149404-02.cppm           |  104 -
 clang/test/Modules/pr172241.cppm              |   47 -
 clang/test/Modules/var-inst-def.cppm          |  110 -
 .../BuiltinMatrix/MatrixSplatErrors.hlsl      |    9 +-
 clang/tools/driver/driver.cpp                 |   20 +-
 .../sanitizer_common/sanitizer_unwind_win.cpp |   49 +-
 .../flang/Optimizer/Builder/HLFIRTools.h      |   25 +-
 flang/lib/Lower/Bridge.cpp                    |    1 -
 flang/lib/Optimizer/Builder/HLFIRTools.cpp    |   35 +-
 .../Support/FIROpenACCTypeInterfaces.cpp      |   56 +-
 flang/test/Lower/OpenACC/acc-reduction.f90    |  145 +-
 flang/test/Lower/do-while-to-scf-while.f90    |   19 +-
 libc/shared/math.h                            |   13 +
 libc/shared/math/bf16divl.h                   |   23 +
 libc/shared/math/bf16fmal.h                   |   25 +
 libc/shared/math/bf16mul.h                    |   22 +
 libc/shared/math/bf16mulf.h                   |   22 +
 libc/shared/math/bf16mulf128.h                |   28 +
 libc/shared/math/bf16mull.h                   |   22 +
 libc/shared/math/f16add.h                     |   29 +
 libc/shared/math/f16addf.h                    |   29 +
 libc/shared/math/f16addf128.h                 |   32 +
 libc/shared/math/f16addl.h                    |   29 +
 libc/shared/math/logbl.h                      |   23 +
 libc/shared/math/tanf16.h                     |   29 +
 libc/shared/math/tanpif.h                     |   23 +
 libc/src/__support/math/CMakeLists.txt        |  141 +
 libc/src/__support/math/bf16divl.h            |   26 +
 libc/src/__support/math/bf16fmal.h            |   26 +
 libc/src/__support/math/bf16mul.h             |   27 +
 libc/src/__support/math/bf16mulf.h            |   27 +
 libc/src/__support/math/bf16mulf128.h         |   33 +
 libc/src/__support/math/bf16mull.h            |   27 +
 libc/src/__support/math/f16add.h              |   31 +
 libc/src/__support/math/f16addf.h             |   31 +
 libc/src/__support/math/f16addf128.h          |   34 +
 libc/src/__support/math/f16addl.h             |   31 +
 libc/src/__support/math/logbl.h               |   26 +
 libc/src/__support/math/tanf16.h              |  137 +
 libc/src/__support/math/tanpif.h              |  115 +
 libc/src/math/generic/CMakeLists.txt          |   80 +-
 libc/src/math/generic/bf16divl.cpp            |    7 +-
 libc/src/math/generic/bf16fmal.cpp            |    9 +-
 libc/src/math/generic/bf16mul.cpp             |    7 +-
 libc/src/math/generic/bf16mulf.cpp            |    7 +-
 libc/src/math/generic/bf16mulf128.cpp         |    7 +-
 libc/src/math/generic/bf16mull.cpp            |    7 +-
 libc/src/math/generic/f16add.cpp              |    6 +-
 libc/src/math/generic/f16addf.cpp             |    6 +-
 libc/src/math/generic/f16addf128.cpp          |    6 +-
 libc/src/math/generic/f16addl.cpp             |    6 +-
 libc/src/math/generic/logbl.cpp               |    6 +-
 libc/src/math/generic/tanf16.cpp              |  112 +-
 libc/src/math/generic/tanpif.cpp              |   95 +-
 libc/test/shared/CMakeLists.txt               |   13 +
 libc/test/shared/shared_math_test.cpp         |   21 +
 .../lib/amdgcn/workitem/get_local_size.cl     |   10 +-
 .../lib/amdgcn/workitem/get_num_groups.cl     |   10 +-
 .../src/detail/offload/offload_topology.cpp   |    2 +-
 lldb/include/lldb/Host/ProcessLaunchInfo.h    |    3 +-
 .../include/lldb/Host/posix/HostThreadPosix.h |    2 +-
 lldb/include/lldb/Utility/ArchSpec.h          |   11 +
 .../Python/lldbsuite/test/lldbinline.py       |    1 +
 .../Python/lldbsuite/test/lldbtest.py         |   16 +-
 .../test/tools/lldb-dap/dap_server.py         |   25 +-
 .../test/tools/lldb-dap/lldbdap_testcase.py   |   37 +-
 lldb/source/Host/posix/HostThreadPosix.cpp    |   12 +-
 .../Disassembler/LLVMC/DisassemblerLLVMC.cpp  |   39 +-
 .../Plugins/ObjectFile/ELF/ObjectFileELF.cpp  |  187 +
 .../Plugins/ObjectFile/ELF/ObjectFileELF.h    |    4 +
 .../Platform/MacOSX/PlatformDarwin.cpp        |  232 +-
 .../Process/Windows/Common/ProcessWindows.cpp |    4 -
 .../API/commands/frame/var/TestFrameVar.py    |    1 +
 .../platform/connect/TestPlatformConnect.py   |    1 +
 .../TestPlatformLaunchGDBServer.py            |    1 +
 .../process/launch/TestProcessLaunch.py       |    1 +
 .../use_source_cache/TestUseSourceCache.py    |    1 +
 .../commands/statistics/basic/TestStats.py    |    1 +
 .../TestAutoInstallMainExecutable.py          |    1 +
 .../dwo/TestDumpDwo.py                        |    1 +
 .../API/commands/trace/TestTraceStartStop.py  |    2 +
 .../TestBreakpointCommand.py                  |    1 +
 .../comp_dir_symlink/TestCompDirSymLink.py    |    2 +
 .../breakpoint/objc/TestObjCBreakpoints.py    |    2 +
 .../generic/map/TestDataFormatterStdMap.py    |    1 +
 .../string/TestDataFormatterStdString.py      |    1 +
 .../gdb_remote_client/TestPty.py              |    2 +
 .../inferior-changed/TestInferiorChanged.py   |    2 +
 .../limit-debug-info/TestLimitDebugInfo.py    |    2 +
 .../module_cache/bsd/TestModuleCacheBSD.py    |    2 +
 .../debug_index/TestDebugIndexCache.py        |    2 +
 .../rerun_and_expr/TestRerunAndExpr.py        |    2 +
 .../TestRerunAndExprDylib.py                  |    2 +
 .../thread/step_until/TestStepUntilAPI.py     |    1 +
 .../TestCCallingConventions.py                |    1 +
 .../TestSharedLibStrippedSymbols.py           |    2 +
 .../cpp/abi_tag_lookup/TestAbiTagLookup.py    |    2 +
 .../abi_tag_structors/TestAbiTagStructors.py  |    2 +
 .../TestConstStaticIntegralMember.py          |    2 +
 .../TestExprDefinitionInDylib.py              |    1 +
 .../TestTemplateWithSameArg.py                |    2 +
 .../lang/cpp/namespace/TestNamespaceLookup.py |    2 +
 .../cpp/template-alias/TestTemplateAlias.py   |    2 +
 .../TestTemplateFunctions.py                  |    2 +
 .../cpp/unique-types3/TestUniqueTypes3.py     |    2 +
 .../TestObjCStructArgument.py                 |    2 +
 .../TestLLDBUtilFailedToHitBreakpoint.py      |    1 -
 .../TestFirmwareCorefiles.py                  |    2 +
 .../macosx/simulator/TestSimulatorPlatform.py |    1 +
 .../skinny-corefile/TestSkinnyCorefile.py     |    2 +
 .../python_api/debugger/TestDebuggerAPI.py    |    1 +
 .../TestTargetArchFromModule.py               |    2 +
 .../TestModuleUnifiedSectionList.py           |    2 +
 .../riscv/disassembler/TestDisassembler.py    |   79 +
 lldb/test/API/riscv/disassembler/a.out.yaml   |   32 +
 .../riscv/disassembler/conflicting.out.yaml   |   38 +
 .../API/riscv/disassembler/stripped.out.yaml  |   28 +
 .../API/source-manager/TestSourceManager.py   |    1 +
 lldb/test/API/test_utils/base/TestBaseTest.py |    2 +
 .../breakpoint/TestDAP_setBreakpoints.py      |    2 +
 .../lldb-dap/disconnect/TestDAP_disconnect.py |    2 +
 .../lldb-dap/launch/TestDAP_launch_args.py    |    3 -
 .../lldb-dap/launch/TestDAP_launch_basic.py   |    3 -
 .../launch/TestDAP_launch_debuggerRoot.py     |    3 -
 .../TestDAP_launch_environment_with_object.py |    3 -
 ...AP_launch_shellExpandArguments_disabled.py |    3 -
 ...DAP_launch_shellExpandArguments_enabled.py |    5 +-
 ...AP_launch_stdio_redirection_and_console.py |    6 +-
 .../lldb-dap/launch/TestDAP_launch_version.py |    3 -
 .../runInTerminal/TestDAP_runInTerminal.py    |    2 +
 .../lldb-dap/variables/TestDAP_variables.py   |   12 +
 .../lldb-server/TestGdbRemotePlatformFile.py  |    2 +
 .../commandline/TestGdbRemoteConnection.py    |    2 +
 lldb/test/API/types/AbstractBase.py           |    2 +
 .../Handler/SetVariableRequestHandler.cpp     |    3 +-
 .../lldb-dap/Protocol/ProtocolRequests.h      |    2 +-
 lldb/unittests/DAP/ProtocolRequestsTest.cpp   |   20 +
 llvm/cmake/modules/AddLLVM.cmake              |    7 +
 llvm/cmake/modules/HandleLLVMOptions.cmake    |    1 -
 .../cmake/modules/llvm-driver-template.cpp.in |    2 +-
 llvm/docs/MIRLangRef.rst                      |    7 +-
 .../llvm/Support/CrashRecoveryContext.h       |    2 +-
 llvm/include/llvm/Support/InitLLVM.h          |    9 +-
 llvm/include/llvm/Support/Signals.h           |    8 +-
 llvm/lib/Analysis/ConstantFolding.cpp         |    3 +-
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     |   35 +-
 llvm/lib/CodeGen/MIRParser/MIParser.cpp       |   97 +-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |    4 +
 llvm/lib/Support/CrashRecoveryContext.cpp     |   25 +-
 llvm/lib/Support/InitLLVM.cpp                 |    8 +-
 llvm/lib/Support/KnownBits.cpp                |   18 +-
 llvm/lib/Support/Unix/Signals.inc             |   58 +-
 llvm/lib/Support/Windows/Signals.inc          |    4 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |    8 +-
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |    2 +-
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   31 +-
 .../Target/AMDGPU/AMDGPULowerVGPREncoding.cpp |    9 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |    2 +
 .../Disassembler/AMDGPUDisassembler.cpp       |   10 +
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h  |    1 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |    4 +
 llvm/lib/Target/AMDGPU/R600InstrInfo.h        |    2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   20 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   42 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |    7 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |    6 +-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |    5 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  148 +-
 .../Target/PowerPC/PPCHazardRecognizers.cpp   |    6 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |   24 +-
 llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp   |    2 +-
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp |   99 +-
 .../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp  |    1 +
 .../RISCV/MCTargetDesc/RISCVMCTargetDesc.h    |    1 +
 llvm/lib/Target/RISCV/RISCVInstrFormatsV.td   |    8 +
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |    1 -
 llvm/lib/Target/RISCV/RISCVInstrInfo.h        |    1 -
 llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td |   12 +
 llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td |    4 +
 llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td    |    4 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td    |   10 +-
 llvm/lib/Target/SystemZ/SystemZInstrInfo.h    |    4 +-
 .../TargetInfo/WebAssemblyTargetInfo.h        |    6 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   10 +-
 llvm/lib/TargetParser/TargetParser.cpp        |   29 +-
 .../AArch64/fp16_intrinsic_scalar_1op.ll      |   86 +-
 ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll |  374 +-
 .../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll   |  598 +-
 .../GlobalISel/wmma-gfx12-w32-iu-modifiers.ll |  258 +-
 .../wmma-gfx12-w32-swmmac-index_key.ll        |  183 +-
 .../AMDGPU/GlobalISel/wmma-gfx12-w32.ll       |  298 +-
 ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll |  291 +-
 .../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll   |  442 +-
 .../GlobalISel/wmma-gfx12-w64-iu-modifiers.ll |  186 +-
 .../wmma-gfx12-w64-swmmac-index_key.ll        |  266 +-
 .../AMDGPU/GlobalISel/wmma-gfx12-w64.ll       |  226 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll |    1 +
 ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll |  371 +-
 .../test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll |  334 +-
 .../AMDGPU/wmma-gfx12-w32-iu-modifiers.ll     |  258 +-
 .../AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll |  183 +-
 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll    |  298 +-
 ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll |  303 +-
 .../test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll |  334 +-
 .../AMDGPU/wmma-gfx12-w64-iu-modifiers.ll     |  186 +-
 .../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll |  266 +-
 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll    |  226 +-
 .../CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir |  237 +-
 .../CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir |  237 +-
 .../X86/expected-integer-after-tied-def.mir   |    2 +-
 .../MIR/X86/invalid-tied-physical-reg-def.mir |   15 +
 ...aren.mir => invalid-type-physical-reg.mir} |    2 +-
 llvm/test/CodeGen/PowerPC/clmul-vector.ll     | 8874 +++++++++++++++++
 llvm/test/CodeGen/X86/known-pow2.ll           |   47 +-
 llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s    | 1529 +++
 llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s    | 1529 +++
 llvm/test/MC/AMDGPU/gfx1170_unsupported.s     |   11 +
 llvm/test/MC/AMDGPU/literals.s                |    8 +-
 .../AMDGPU/gfx1170_dasm_wmma_w32.txt          | 1628 +++
 .../AMDGPU/gfx1170_dasm_wmma_w64.txt          | 1628 +++
 .../ConstProp/vector-type-constant-folding.ll |   25 +
 llvm/unittests/Target/X86/CMakeLists.txt      |    2 +
 .../Target/X86/X86SelectionDAGTest.cpp        |  103 +
 llvm/utils/TableGen/CodeGenMapTable.cpp       |    6 +-
 .../llvm/unittests/Target/X86/BUILD.gn        |    1 +
 .../mlir/Dialect/MemRef/Utils/MemRefUtils.h   |   15 +
 mlir/include/mlir/Dialect/SCF/Utils/Utils.h   |    7 +
 .../Conversion/MathToSPIRV/MathToSPIRV.cpp    |   15 +-
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    |    8 +-
 mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp |   41 +
 mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt    |    1 +
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       |  119 -
 mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp     |  186 +
 .../lib/Dialect/SCF/Transforms/CMakeLists.txt |    1 +
 .../SCF/Transforms/ParallelLoopFusion.cpp     |  742 +-
 mlir/lib/Dialect/SCF/Utils/Utils.cpp          |   19 +
 .../MathToSPIRV/math-to-gl-spirv.mlir         |   21 +
 mlir/test/Dialect/LLVMIR/invalid.mlir         |    7 +
 .../Dialect/SCF/parallel-loop-fusion.mlir     |  424 +-
 .../Dialect/XeGPU/LANE/no-xegpu-ops.mlir      |   53 +
 utils/bazel/BUILD.bazel                       |    8 +-
 .../clang-tools-extra/clang-query/BUILD.bazel |   16 +-
 .../llvm-project-overlay/libc/BUILD.bazel     |  218 +-
 .../lldb/source/Plugins/BUILD.bazel           |    1 +
 .../llvm-project-overlay/mlir/BUILD.bazel     |    3 +
 325 files changed, 25008 insertions(+), 4720 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/builtin-structured-binding-size.cpp
 create mode 100644 clang/test/CIR/CodeGenBuiltins/builtin-trivally-copyable.cpp
 create mode 100644 clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
 create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl
 create mode 100644 clang/test/CodeGenHLSL/resources/cbuffer_matrix_align.hlsl
 create mode 100644 clang/test/Driver/Inputs/XRSimulator1.0.sdk/usr/include/libxml/.keep
 delete mode 100644 clang/test/Modules/demote-var-def.cpp
 delete mode 100644 clang/test/Modules/pr149404-02.cppm
 delete mode 100644 clang/test/Modules/pr172241.cppm
 delete mode 100644 clang/test/Modules/var-inst-def.cppm
 create mode 100644 libc/shared/math/bf16divl.h
 create mode 100644 libc/shared/math/bf16fmal.h
 create mode 100644 libc/shared/math/bf16mul.h
 create mode 100644 libc/shared/math/bf16mulf.h
 create mode 100644 libc/shared/math/bf16mulf128.h
 create mode 100644 libc/shared/math/bf16mull.h
 create mode 100644 libc/shared/math/f16add.h
 create mode 100644 libc/shared/math/f16addf.h
 create mode 100644 libc/shared/math/f16addf128.h
 create mode 100644 libc/shared/math/f16addl.h
 create mode 100644 libc/shared/math/logbl.h
 create mode 100644 libc/shared/math/tanf16.h
 create mode 100644 libc/shared/math/tanpif.h
 create mode 100644 libc/src/__support/math/bf16divl.h
 create mode 100644 libc/src/__support/math/bf16fmal.h
 create mode 100644 libc/src/__support/math/bf16mul.h
 create mode 100644 libc/src/__support/math/bf16mulf.h
 create mode 100644 libc/src/__support/math/bf16mulf128.h
 create mode 100644 libc/src/__support/math/bf16mull.h
 create mode 100644 libc/src/__support/math/f16add.h
 create mode 100644 libc/src/__support/math/f16addf.h
 create mode 100644 libc/src/__support/math/f16addf128.h
 create mode 100644 libc/src/__support/math/f16addl.h
 create mode 100644 libc/src/__support/math/logbl.h
 create mode 100644 libc/src/__support/math/tanf16.h
 create mode 100644 libc/src/__support/math/tanpif.h
 create mode 100644 lldb/test/API/riscv/disassembler/TestDisassembler.py
 create mode 100644 lldb/test/API/riscv/disassembler/a.out.yaml
 create mode 100644 lldb/test/API/riscv/disassembler/conflicting.out.yaml
 create mode 100644 lldb/test/API/riscv/disassembler/stripped.out.yaml
 create mode 100644 llvm/test/CodeGen/MIR/X86/invalid-tied-physical-reg-def.mir
 rename llvm/test/CodeGen/MIR/X86/{expected-tied-def-after-lparen.mir => invalid-type-physical-reg.mir} (87%)
 create mode 100644 llvm/test/CodeGen/PowerPC/clmul-vector.ll
 create mode 100644 llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx1170_unsupported.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt
 create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/vector-type-constant-folding.ll
 create mode 100644 llvm/unittests/Target/X86/X86SelectionDAGTest.cpp
 create mode 100644 mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
 create mode 100644 mlir/test/Integration/Dialect/XeGPU/LANE/no-xegpu-ops.mlir

diff --git a/clang-tools-extra/clangd/TidyProvider.cpp b/clang-tools-extra/clangd/TidyProvider.cpp
index 801b3af2fbdd5..bfb0835af2245 100644
--- a/clang-tools-extra/clangd/TidyProvider.cpp
+++ b/clang-tools-extra/clangd/TidyProvider.cpp
@@ -222,7 +222,8 @@ TidyProvider disableUnusableChecks(llvm::ArrayRef<std::string> ExtraBadChecks) {
       "-hicpp-invalid-access-moved",
       // Check uses dataflow analysis, which might hang/crash unexpectedly on
       // incomplete code.
-      "-bugprone-unchecked-optional-access");
+      "-bugprone-unchecked-optional-access",
+      "-abseil-unchecked-statusor-access");
 
   size_t Size = BadChecks.size();
   for (const std::string &Str : ExtraBadChecks) {
diff --git a/clang-tools-extra/docs/clang-tidy/checks/abseil/unchecked-statusor-access.rst b/clang-tools-extra/docs/clang-tidy/checks/abseil/unchecked-statusor-access.rst
index 8a766e8f6abe4..c56ff8c886e2c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/abseil/unchecked-statusor-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/abseil/unchecked-statusor-access.rst
@@ -93,7 +93,7 @@ known to have ok status. For example:
 Ensuring that the status is ok using common macros
 --------------------------------------------------
 
-The check is aware of common macros like ``ABSL_CHECK`` and ``ASSERT_THAT``.
+The check is aware of common macros like ``ABSL_CHECK`` or ``ABSL_CHECK_OK``.
 Those can be used to ensure that the status of a ``StatusOr<T>`` object
 is ok. For example:
 
@@ -104,6 +104,46 @@ is ok. For example:
      use(*x);
    }
 
+Ensuring that the status is ok using googletest macros
+------------------------------------------------------
+
+The check is aware of ``googletest`` (or ``gtest``) macros and matchers.
+Accessing the value of a ``StatusOr<T>`` object is considered safe if it
+is preceded by an ``ASSERT_`` macro that ensures the status is ok.
+For example:
+
+.. code:: cpp
+
+   TEST(MySuite, MyTest) {
+     absl::StatusOr<int> x = foo();
+     ASSERT_OK(x);
+     use(*x);
+   }
+
+   TEST(MySuite, MyOtherTest) {
+     absl::StatusOr<int> x = foo();
+     ASSERT_THAT(x, absl_testing::IsOk());
+     use(*x);
+   }
+
+The following ``googletest`` macros are supported:
+
+- ``ASSERT_OK(...)``
+- ``ASSERT_TRUE(...)``
+- ``ASSERT_FALSE(...)``
+- ``ASSERT_THAT(...)``
+
+The following matchers are supported:
+
+- ``IsOk()``
+- ``StatusIs(...)``
+- ``IsOkAndHolds(...)``
+- ``CanonicalStatusIs(...)``
+
+**Note**: ``EXPECT_`` macros (like ``EXPECT_OK`` or ``EXPECT_TRUE(x.ok())``)
+do **not** make subsequent accesses safe because they do not terminate the
+test execution.
+
 Ensuring that the status is ok, then accessing the value in a correlated branch
 -------------------------------------------------------------------------------
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 86cee7d1b6f9b..56c8b79e37576 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -160,6 +160,8 @@ Deprecated Compiler Flags
 Modified Compiler Flags
 -----------------------
 - The `-mno-outline` and `-moutline` compiler flags are now allowed on RISC-V and X86, which both support the machine outliner.
+- The `-mno-outline` flag will now add the `nooutline` IR attribute, so that
+  `-mno-outline` and `-moutline` objects can be mixed correctly during LTO.
 
 Removed Compiler Flags
 ----------------------
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 7ff55bc9d77a7..e51015655de65 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -139,55 +139,6 @@ core.DivideZero (C, C++, ObjC)
 .. literalinclude:: checkers/dividezero_example.c
     :language: c
 
-.. _core-FixedAddressDereference:
-
-core.FixedAddressDereference (C, C++, ObjC)
-"""""""""""""""""""""""""""""""""""""""""""
-Check for dereferences of fixed addresses.
-
-A pointer contains a fixed address if it was set to a hard-coded value or it
-becomes otherwise obvious that at that point it can have only a single fixed
-numerical value.
-
-.. code-block:: c
-
- void test1() {
-   int *p = (int *)0x020;
-   int x = p[0]; // warn
- }
-
- void test2(int *p) {
-   if (p == (int *)-1)
-     *p = 0; // warn
- }
-
- void test3() {
-   int (*p_function)(char, char);
-   p_function = (int (*)(char, char))0x04080;
-   int x = (*p_function)('x', 'y'); // NO warning yet at functon pointer calls
- }
-
- void volatile_pointee() {
-   *(volatile int *)0x404 = 1; // no warning: constant non-null "volatile" pointee, you must know what you are doing
- }
-
- void deref_volatile_nullptr() {
-   *(volatile int *)0 = 1; // core.NullDereference still warns about this
- }
-
-If your project is low-level (e.g., firmware), or deals with hardware interop with a lot of genuine constant addresses, then consider disabling this checker.
-The checker automatically suppresses issues if the type of the pointee of the address is ``volatile``.
-You probably already need this to be ``volatile`` for legitimate access, so the checker suppresses such issues to avoid false-positives.
-Note that null pointers will still be reported by :ref:`core.NullDereference <core-NullDereference>`
-regardless if the pointee is ``volatile`` or not.
-
-If the analyzer option ``suppress-dereferences-from-any-address-space`` is set
-to true (the default value), then this checker never reports dereference of
-pointers with a specified address space. If the option is set to false, then
-reports from the specific x86 address spaces 256, 257 and 258 are still
-suppressed, but fixed address dereferences from other address spaces are
-reported.
-
 .. _core-NonNullParamChecker:
 
 core.NonNullParamChecker (C, C++, ObjC)
@@ -898,6 +849,55 @@ of this Clang attribute.
 
 Projects that use this pattern should not enable this optin checker.
 
+.. _optin-core-FixedAddressDereference:
+
+optin.core.FixedAddressDereference (C, C++, ObjC)
+"""""""""""""""""""""""""""""""""""""""""""""""""
+Check for dereferences of fixed addresses.
+
+A pointer contains a fixed address if it was set to a hard-coded value or it
+becomes otherwise obvious that at that point it can have only a single fixed
+numerical value.
+
+.. code-block:: c
+
+ void test1() {
+   int *p = (int *)0x020;
+   int x = p[0]; // warn
+ }
+
+ void test2(int *p) {
+   if (p == (int *)-1)
+     *p = 0; // warn
+ }
+
+ void test3() {
+   int (*p_function)(char, char);
+   p_function = (int (*)(char, char))0x04080;
+   int x = (*p_function)('x', 'y'); // NO warning yet at function pointer calls
+ }
+
+ void volatile_pointee() {
+   *(volatile int *)0x404 = 1; // no warning: constant non-null "volatile" pointee, you must know what you are doing
+ }
+
+ void deref_volatile_nullptr() {
+   *(volatile int *)0 = 1; // core.NullDereference still warns about this
+ }
+
+If your project is low-level (e.g., firmware), or deals with hardware interop with a lot of genuine constant addresses, then consider disabling this checker.
+The checker automatically suppresses issues if the type of the pointee of the address is ``volatile``.
+You probably already need this to be ``volatile`` for legitimate access, so the checker suppresses such issues to avoid false-positives.
+Note that null pointers will still be reported by :ref:`core.NullDereference <core-NullDereference>`
+regardless if the pointee is ``volatile`` or not.
+
+If the analyzer option ``suppress-dereferences-from-any-address-space`` is set
+to true (the default value), then this checker never reports dereference of
+pointers with a specified address space. If the option is set to false, then
+reports from the specific x86 address spaces 256, 257 and 258 are still
+suppressed, but fixed address dereferences from other address spaces are
+reported.
+
 .. _optin-cplusplus-UninitializedObject:
 
 optin.cplusplus.UninitializedObject (C++)
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
index 11042e865c4e6..4b6306eb21dc3 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -17,6 +17,7 @@
 
 #include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
 #include "clang/AST/TypeOrdering.h"
 #include "clang/Analysis/FlowSensitive/ASTOps.h"
 #include "clang/Analysis/FlowSensitive/AdornedCFG.h"
@@ -207,8 +208,9 @@ class DataflowAnalysisContext {
   Solver::Result querySolver(llvm::SetVector<const Formula *> Constraints);
 
   /// Returns the fields of `Type`, limited to the set of fields modeled by this
-  /// context.
-  FieldSet getModeledFields(QualType Type);
+  /// context. The returned reference is valid for the lifetime of the context,
+  /// or until `addModeledFields()` is called.
+  const FieldSet &getModeledFields(QualType Type);
 
   /// Returns the names and types of the synthetic fields for the given record
   /// type.
@@ -262,7 +264,11 @@ class DataflowAnalysisContext {
   /// `Tokens` in the dependency graph.
   llvm::DenseSet<Atom> collectDependencies(llvm::DenseSet<Atom> Tokens) const;
 
-  // Extends the set of modeled field declarations.
+  /// Computes and returns the fields of `Type`, limited to the set of fields
+  /// modeled by this context.
+  FieldSet computeModeledFields(QualType Type);
+
+  /// Extends the set of modeled field declarations.
   void addModeledFields(const FieldSet &Fields);
 
   /// Adds all constraints of the flow condition identified by `Token` and all
@@ -326,9 +332,16 @@ class DataflowAnalysisContext {
 
   llvm::DenseMap<const FunctionDecl *, AdornedCFG> FunctionContexts;
 
-  // Fields modeled by environments covered by this context.
+  // Fields (from any record Type) modeled by environments using this context.
+  // The set may only contain fields that are referenced in the scope of
+  // the environments (but it is up to the environment what is relevant to
+  // model).
   FieldSet ModeledFields;
 
+  // A lazily-computed and cached version of ModeledFields that is split by
+  // record Type.
+  llvm::DenseMap<QualType, std::unique_ptr<FieldSet>> CachedModeledFields;
+
   std::unique_ptr<Logger> LogOwner; // If created via flags.
 
   std::function<llvm::StringMap<QualType>(QualType)> SyntheticFieldCallback;
diff --git a/clang/include/clang/Analysis/Scalable/Serialization/SerializationFormatRegistry.h b/clang/include/clang/Analysis/Scalable/Serialization/SerializationFormatRegistry.h
index d7e77b9b18f77..ef060dd27c522 100644
--- a/clang/include/clang/Analysis/Scalable/Serialization/SerializationFormatRegistry.h
+++ b/clang/include/clang/Analysis/Scalable/Serialization/SerializationFormatRegistry.h
@@ -7,7 +7,24 @@
 //===----------------------------------------------------------------------===//
 //
 // Registry for SerializationFormats, and some helper functions.
-// To register some custom serialization format, insert this code:
+//
+// To register some custom serialization format, you will need to add some
+// declarations and definitions.
+//
+// Insert this code into the header file:
+//
+//   namespace llvm {
+//   extern template class CLANG_TEMPLATE_ABI
+//     Registry<clang::ssaf::MyFormat::FormatInfo>;
+//   } // namespace llvm
+//
+// Insert this declaration into the MyFormat class:
+//
+//   using FormatInfo = FormatInfoEntry<SerializerFn, DeserializerFn>;
+//
+// Insert this code into the cpp file:
+//
+//   LLVM_INSTANTIATE_REGISTRY(llvm::Registry<MyFormat::FormatInfo>)
 //
 //   static SerializationFormatRegistry::Add<MyFormat>
 //     RegisterFormat("MyFormat", "My awesome serialization format");
@@ -17,7 +34,7 @@
 //
 //   namespace {
 //   using FormatInfo = MyFormat::FormatInfo;
-//   struct MyAnalysisFormatInfo : FormatInfo {
+//   struct MyAnalysisFormatInfo final : FormatInfo {
 //     MyAnalysisFormatInfo() : FormatInfo{
 //               SummaryName("MyAnalysis"),
 //               serializeMyAnalysis,
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 78443ac291f31..86b10eba55e8e 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -358,23 +358,23 @@ def __builtin_amdgcn_s_wait_event : AMDGPUBuiltin<"void(_Constant short)", [], "
 // Postfix w32 indicates the builtin requires wavefront size of 32.
 // Postfix w64 indicates the builtin requires wavefront size of 64.
 //===----------------------------------------------------------------------===//
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
 
 def __builtin_amdgcn_s_sendmsg_rtn : AMDGPUBuiltin<"unsigned int(_Constant unsigned int)", [], "gfx11-insts">;
 def __builtin_amdgcn_s_sendmsg_rtnl : AMDGPUBuiltin<"uint64_t(_Constant unsigned int)", [], "gfx11-insts">;
@@ -599,67 +599,71 @@ def __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn : AMDGPUBuiltin<"_ExtVector<2,
 // The second return value of the intrinsic is zext'ed.
 def __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn : AMDGPUBuiltin<"_ExtVector<2, uint64_t>(unsigned int, unsigned int, _ExtVector<8, unsigned int>, _Constant int)", [], "gfx12-insts">;
 
+//===----------------------------------------------------------------------===//
+// GFX1170, GFX12+ only builtins.
+//===----------------------------------------------------------------------===//
+
 //===----------------------------------------------------------------------===//
 // WMMA builtins.
 // Postfix w32 indicates the builtin requires wavefront size of 32.
 // Postfix w64 indicates the builtin requires wavefront size of 64.
 //
-// Some of these are very similar to their GFX11 counterparts, but they don't
-// require replication of the A,B matrices, so they use fewer vector elements.
-// Therefore, we add an "_gfx12" suffix to distinguish them from the existing
-// builtins.
+// Some of these are very similar to their base GFX11 counterparts, but they
+// don't require replication of the A,B matrices, so they use fewer vector
+// elements. Therefore, we add an "_gfx12" suffix to distinguish them from the
+// existing builtins.
 //===----------------------------------------------------------------------===//
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-// These are gfx12-only, but for consistency with the other WMMA variants we're
-// keeping the "_gfx12" suffix.
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-// These are gfx12-only, but for consistency with the other WMMA variants we're
-// keeping the "_gfx12" suffix.
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-
-def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+// These are gfx1170 and gfx12 only, but for consistency with the other WMMA
+// variants we're keeping the "_gfx12" suffix.
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+// These are gfx1170 and gfx12 only, but for consistency with the other WMMA
+// variants we're keeping the "_gfx12" suffix.
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+
+def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
 
 def __builtin_amdgcn_prng_b32 : AMDGPUBuiltin<"unsigned int(unsigned int)", [Const], "prng-inst">;
 def __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, _Float16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 8c056bb690690..5e174b21be466 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -509,6 +509,9 @@ CODEGENOPT(AllResourcesBound, 1, 0, Benign)
 ENUM_CODEGENOPT(WinX64EHUnwindV2, WinX64EHUnwindV2Mode,
                 2, WinX64EHUnwindV2Mode::Disabled, Benign)
 
+/// Adds attributes that prevent outlining (`-mno-outline`)
+CODEGENOPT(DisableOutlining, 1, 0, Benign)
+
 /// FIXME: Make DebugOptions its own top-level .def file.
 #include "DebugOptions.def"
 
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index cf312af194e85..78d7ef7510000 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -6211,12 +6211,6 @@ def CIR_TryOp : CIR_Op<"try",[
     Holds the lexical scope of `try {}`. Note that resources used on catch
     clauses are usually allocated in the same parent as `cir.try`.
 
-    `synthetic`: use `cir.try` to represent try/catches not originally
-    present in the source code. For example, a synthetic `cir.try` region
-    is created around the constructor call when `operator new` is used
-    so that the memory allocated will be freed if the constructor throws
-    an exception.
-
     `cleanup`: indicates that there are cleanups that must be performed
     when exiting the try region via exception, even if the exception is not
     caught.
@@ -6238,7 +6232,6 @@ def CIR_TryOp : CIR_Op<"try",[
   }];
 
   let arguments = (ins
-    UnitAttr:$synthetic,
     UnitAttr:$cleanup,
     DefaultValuedAttr<CIR_TryHandlerArrayAttr, "{}">:$handler_types
   );
@@ -6249,7 +6242,6 @@ def CIR_TryOp : CIR_Op<"try",[
   );
 
   let assemblyFormat = [{
-    (`synthetic` $synthetic^)?
     (`cleanup` $cleanup^)?
     $try_region
     custom<TryHandlerRegions>($handler_regions, $handler_types)
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 24b31fb3fefcc..c8fb2a55fe7ac 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5300,16 +5300,13 @@ def mmacos_version_min_EQ : Joined<["-"], "mmacos-version-min=">,
 def : Joined<["-"], "mmacosx-version-min=">,
   Visibility<[ClangOption, CC1Option, FC1Option, FlangOption]>,
   Group<m_Group>, Alias<mmacos_version_min_EQ>;
-def moutline
-    : Flag<["-"], "moutline">,
-      Group<f_clang_Group>,
-      Visibility<[ClangOption, CC1Option]>,
-      HelpText<"Enable function outlining (AArch64,Arm,RISC-V,X86 only)">;
-def mno_outline
-    : Flag<["-"], "mno-outline">,
-      Group<f_clang_Group>,
-      Visibility<[ClangOption, CC1Option]>,
-      HelpText<"Disable function outlining (AArch64,Arm,RISC-V,X86 only)">;
+defm outline
+    : BoolMOption<
+          "outline", CodeGenOpts<"DisableOutlining">, DefaultFalse,
+          NegFlag<SetTrue, [], [ClangOption, CC1Option],
+                  "Disable function outlining (AArch64,Arm,RISC-V,X86 only)">,
+          PosFlag<SetFalse, [], [ClangOption],
+                  "Enable function outlining (AArch64,Arm,RISC-V,X86 only)">>;
 def mms_bitfields : Flag<["-"], "mms-bitfields">, Group<m_Group>,
   HelpText<"Set the default structure layout to be compatible with the Microsoft compiler standard">;
 def mno_ms_bitfields : Flag<["-"], "mno-ms-bitfields">, Group<m_Group>,
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 58e785d5ca36f..35d2f9c1d5ef1 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -188,11 +188,6 @@ def CallAndMessageChecker
 ]>,
       Documentation<HasDocumentation>;
 
-def FixedAddressDereferenceChecker
-    : Checker<"FixedAddressDereference">,
-      HelpText<"Check for dereferences of fixed addresses">,
-      Documentation<HasDocumentation>;
-
 def NullDereferenceChecker
     : Checker<"NullDereference">,
       HelpText<"Check for dereferences of null pointers">,
@@ -426,6 +421,11 @@ def EnumCastOutOfRangeChecker : Checker<"EnumCastOutOfRange">,
   HelpText<"Check integer to enumeration casts for out of range values">,
   Documentation<HasDocumentation>;
 
+def FixedAddressDereferenceChecker
+    : Checker<"FixedAddressDereference">,
+      HelpText<"Check for dereferences of fixed addresses">,
+      Documentation<HasDocumentation>;
+
 } // end "optin.core"
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp
index 5b745dd3c43f5..15d0e6435aaf3 100644
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <cstring>
 #include <optional>
@@ -57,25 +58,23 @@ using namespace clang;
 #define ABSTRACT_STMT(STMT)
 #include "clang/AST/StmtNodes.inc"
 
-static struct StmtClassNameTable {
+struct StmtClassNameTable {
   const char *Name;
   unsigned Counter;
   unsigned Size;
-} StmtClassInfo[Stmt::lastStmtConstant+1];
+};
 
 static StmtClassNameTable &getStmtInfoTableEntry(Stmt::StmtClass E) {
-  static bool Initialized = false;
-  if (Initialized)
-    return StmtClassInfo[E];
-
-  // Initialize the table on the first use.
-  Initialized = true;
+  static std::array<StmtClassNameTable, Stmt::lastStmtConstant + 1>
+      StmtClassInfo = [] {
+        std::array<StmtClassNameTable, Stmt::lastStmtConstant + 1> Table{};
 #define ABSTRACT_STMT(STMT)
-#define STMT(CLASS, PARENT) \
-  StmtClassInfo[(unsigned)Stmt::CLASS##Class].Name = #CLASS;    \
-  StmtClassInfo[(unsigned)Stmt::CLASS##Class].Size = sizeof(CLASS);
+#define STMT(CLASS, PARENT)                                                    \
+  Table[static_cast<unsigned>(Stmt::CLASS##Class)].Name = #CLASS;              \
+  Table[static_cast<unsigned>(Stmt::CLASS##Class)].Size = sizeof(CLASS);
 #include "clang/AST/StmtNodes.inc"
-
+        return Table;
+      }();
   return StmtClassInfo[E];
 }
 
@@ -85,7 +84,7 @@ void *Stmt::operator new(size_t bytes, const ASTContext& C,
 }
 
 const char *Stmt::getStmtClassName() const {
-  return getStmtInfoTableEntry((StmtClass) StmtBits.sClass).Name;
+  return getStmtInfoTableEntry(static_cast<StmtClass>(StmtBits.sClass)).Name;
 }
 
 // Check that no statement / expression class is polymorphic. LLVM style RTTI
@@ -113,19 +112,25 @@ void Stmt::PrintStats() {
   unsigned sum = 0;
   llvm::errs() << "\n*** Stmt/Expr Stats:\n";
   for (int i = 0; i != Stmt::lastStmtConstant+1; i++) {
-    if (StmtClassInfo[i].Name == nullptr) continue;
-    sum += StmtClassInfo[i].Counter;
+    const StmtClassNameTable &Entry =
+        getStmtInfoTableEntry(static_cast<Stmt::StmtClass>(i));
+    if (Entry.Name == nullptr)
+      continue;
+    sum += Entry.Counter;
   }
   llvm::errs() << "  " << sum << " stmts/exprs total.\n";
   sum = 0;
   for (int i = 0; i != Stmt::lastStmtConstant+1; i++) {
-    if (StmtClassInfo[i].Name == nullptr) continue;
-    if (StmtClassInfo[i].Counter == 0) continue;
-    llvm::errs() << "    " << StmtClassInfo[i].Counter << " "
-                 << StmtClassInfo[i].Name << ", " << StmtClassInfo[i].Size
-                 << " each (" << StmtClassInfo[i].Counter*StmtClassInfo[i].Size
+    const StmtClassNameTable &Entry =
+        getStmtInfoTableEntry(static_cast<Stmt::StmtClass>(i));
+    if (Entry.Name == nullptr)
+      continue;
+    if (Entry.Counter == 0)
+      continue;
+    llvm::errs() << "    " << Entry.Counter << " " << Entry.Name << ", "
+                 << Entry.Size << " each (" << Entry.Counter * Entry.Size
                  << " bytes)\n";
-    sum += StmtClassInfo[i].Counter*StmtClassInfo[i].Size;
+    sum += Entry.Counter * Entry.Size;
   }
 
   llvm::errs() << "Total bytes = " << sum << "\n";
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
index 6e3a270e6bed6..1a68b2e81634f 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
+#include "clang/AST/Type.h"
 #include "clang/Analysis/FlowSensitive/ASTOps.h"
 #include "clang/Analysis/FlowSensitive/Formula.h"
 #include "clang/Analysis/FlowSensitive/Logger.h"
@@ -43,7 +44,7 @@ static llvm::cl::opt<std::string> DataflowLog(
 namespace clang {
 namespace dataflow {
 
-FieldSet DataflowAnalysisContext::getModeledFields(QualType Type) {
+FieldSet DataflowAnalysisContext::computeModeledFields(QualType Type) {
   // During context-sensitive analysis, a struct may be allocated in one
   // function, but its field accessed in a function lower in the stack than
   // the allocation. Since we only collect fields used in the function where
@@ -57,8 +58,17 @@ FieldSet DataflowAnalysisContext::getModeledFields(QualType Type) {
   return llvm::set_intersection(getObjectFields(Type), ModeledFields);
 }
 
+const FieldSet &DataflowAnalysisContext::getModeledFields(QualType Type) {
+  QualType CanonicalType = Type.getCanonicalType().getUnqualifiedType();
+  std::unique_ptr<FieldSet> &Fields = CachedModeledFields[CanonicalType];
+  if (Fields == nullptr)
+    Fields = std::make_unique<FieldSet>(computeModeledFields(CanonicalType));
+  return *Fields;
+}
+
 void DataflowAnalysisContext::addModeledFields(const FieldSet &Fields) {
   ModeledFields.set_union(Fields);
+  CachedModeledFields.clear();
 }
 
 StorageLocation &DataflowAnalysisContext::createStorageLocation(QualType Type) {
diff --git a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
index 03d6ed8020a0a..767521334b0a2 100644
--- a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
+++ b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
@@ -85,7 +85,7 @@ void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
     // Dst may have children modeled from other derived types than SrcType, e.g.
     // after casts of Dst to other types derived from DstType. Only copy the
     // children and synthetic fields present in both Dst and SrcType.
-    const FieldSet FieldsInSrcType =
+    const FieldSet &FieldsInSrcType =
         Env.getDataflowAnalysisContext().getModeledFields(SrcType);
     for (auto [Field, DstFieldLoc] : Dst.children())
       if (const auto *FieldAsFieldDecl = dyn_cast<FieldDecl>(Field);
@@ -103,7 +103,7 @@ void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
     // after other casts of Src to those types (likely in different branches,
     // but without flow-condition-dependent field modeling). Only copy the
     // children and synthetic fields of Src that are present in DstType.
-    const FieldSet FieldsInDstType =
+    const FieldSet &FieldsInDstType =
         Env.getDataflowAnalysisContext().getModeledFields(DstType);
     for (auto [Field, SrcFieldLoc] : Src.children()) {
       if (const auto *FieldAsFieldDecl = dyn_cast<FieldDecl>(Field);
diff --git a/clang/lib/Basic/ParsedAttrInfo.cpp b/clang/lib/Basic/ParsedAttrInfo.cpp
index 16fa314b642b9..d5b17b34b6e3a 100644
--- a/clang/lib/Basic/ParsedAttrInfo.cpp
+++ b/clang/lib/Basic/ParsedAttrInfo.cpp
@@ -20,13 +20,16 @@ using namespace clang;
 
 LLVM_INSTANTIATE_REGISTRY(ParsedAttrInfoRegistry)
 
+static std::list<std::unique_ptr<ParsedAttrInfo>> instantiateEntries() {
+  std::list<std::unique_ptr<ParsedAttrInfo>> Instances;
+  for (const auto &It : ParsedAttrInfoRegistry::entries())
+    Instances.emplace_back(It.instantiate());
+  return Instances;
+}
+
 const std::list<std::unique_ptr<ParsedAttrInfo>> &
 clang::getAttributePluginInstances() {
-  static llvm::ManagedStatic<std::list<std::unique_ptr<ParsedAttrInfo>>>
-      PluginAttrInstances;
-  if (PluginAttrInstances->empty())
-    for (const auto &It : ParsedAttrInfoRegistry::entries())
-      PluginAttrInstances->emplace_back(It.instantiate());
-
-  return *PluginAttrInstances;
+  static std::list<std::unique_ptr<ParsedAttrInfo>> Instances =
+      instantiateEntries();
+  return Instances;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 71cf896aede10..699fee5a3a358 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -123,6 +123,17 @@ emitAArch64CompareBuiltinExpr(CIRGenFunction &cgf, CIRGenBuilderTy &builder,
   return builder.createCast(loc, cir::CastKind::integral, cmp, retTy);
 }
 
+// Emit an intrinsic where all operands are of the same type as the result.
+// Depending on mode, this may be a constrained floating-point intrinsic.
+static mlir::Value
+emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc,
+                                StringRef intrName, mlir::Type retTy,
+                                llvm::SmallVector<mlir::Value> &ops) {
+  assert(!cir::MissingFeatures::emitConstrainedFPCall());
+
+  return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops);
+}
+
 bool CIRGenFunction::getAArch64SVEProcessedOperands(
     unsigned builtinID, const CallExpr *expr, SmallVectorImpl<mlir::Value> &ops,
     SVETypeFlags typeFlags) {
@@ -1344,10 +1355,41 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   // Find out if any arguments are required to be integer constant
   // expressions.
   assert(!cir::MissingFeatures::handleBuiltinICEArguments());
+  unsigned iceArguments = 0;
+  ASTContext::GetBuiltinTypeError error;
+  getContext().GetBuiltinType(builtinID, error, &iceArguments);
+  assert(error == ASTContext::GE_None && "Should not codegen an error");
+  llvm::SmallVector<mlir::Value> ops;
+  for (auto [idx, arg] : llvm::enumerate(expr->arguments())) {
+    if (idx == 0) {
+      switch (builtinID) {
+      case NEON::BI__builtin_neon_vld1_v:
+      case NEON::BI__builtin_neon_vld1q_v:
+      case NEON::BI__builtin_neon_vld1_dup_v:
+      case NEON::BI__builtin_neon_vld1q_dup_v:
+      case NEON::BI__builtin_neon_vld1_lane_v:
+      case NEON::BI__builtin_neon_vld1q_lane_v:
+      case NEON::BI__builtin_neon_vst1_v:
+      case NEON::BI__builtin_neon_vst1q_v:
+      case NEON::BI__builtin_neon_vst1_lane_v:
+      case NEON::BI__builtin_neon_vst1q_lane_v:
+      case NEON::BI__builtin_neon_vldap1_lane_s64:
+      case NEON::BI__builtin_neon_vldap1q_lane_s64:
+      case NEON::BI__builtin_neon_vstl1_lane_s64:
+      case NEON::BI__builtin_neon_vstl1q_lane_s64:
+        // Get the alignment for the argument in addition to the value;
+        // we'll use it later.
+        cgm.errorNYI(
+            expr->getSourceRange(),
+            std::string("unimplemented AArch64 builtin argument handling ") +
+                getContext().BuiltinInfo.getName(builtinID));
+      }
+    }
+    ops.push_back(emitScalarOrConstFoldImmArg(iceArguments, idx, arg));
+  }
 
   assert(!cir::MissingFeatures::neonSISDIntrinsics());
 
-  llvm::SmallVector<mlir::Value> ops;
   mlir::Location loc = getLoc(expr->getExprLoc());
 
   // Handle non-overloaded intrinsics first.
@@ -1355,7 +1397,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   default:
     break;
   case NEON::BI__builtin_neon_vabsh_f16: {
-    ops.push_back(emitScalarExpr(expr->getArg(0)));
     return cir::FAbsOp::create(builder, loc, ops);
   }
   case NEON::BI__builtin_neon_vaddq_p128:
@@ -1397,7 +1438,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
                      getContext().BuiltinInfo.getName(builtinID));
     return mlir::Value{};
   case NEON::BI__builtin_neon_vceqzd_s64:
-    ops.push_back(emitScalarExpr(expr->getArg(0)));
     return emitAArch64CompareBuiltinExpr(
         *this, builder, loc, ops[0],
         convertType(expr->getCallReturnType(getContext())), cir::CmpOpKind::eq);
@@ -1451,11 +1491,9 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
                      getContext().BuiltinInfo.getName(builtinID));
     return mlir::Value{};
   case NEON::BI__builtin_neon_vnegd_s64: {
-    ops.push_back(emitScalarExpr(expr->getArg(0)));
     return builder.createNeg(ops[0]);
   }
   case NEON::BI__builtin_neon_vnegh_f16: {
-    ops.push_back(emitScalarExpr(expr->getArg(0)));
     return builder.createFNeg(ops[0]);
   }
   case NEON::BI__builtin_neon_vtstd_s64:
@@ -1508,8 +1546,22 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vsubh_f16:
   case NEON::BI__builtin_neon_vmulh_f16:
   case NEON::BI__builtin_neon_vdivh_f16:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
   case NEON::BI__builtin_neon_vfmah_f16:
+    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+    std::rotate(ops.begin(), ops.begin() + 1, ops.end());
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma",
+                                           convertType(expr->getType()), ops);
+    break;
   case NEON::BI__builtin_neon_vfmsh_f16:
+    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+    std::rotate(ops.begin(), ops.begin() + 1, ops.end());
+    ops[0] = builder.createFNeg(ops[0]);
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma",
+                                           convertType(expr->getType()), ops);
   case NEON::BI__builtin_neon_vaddd_s64:
   case NEON::BI__builtin_neon_vaddd_u64:
   case NEON::BI__builtin_neon_vsubd_s64:
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 5977f8c585e26..2c5a57e1ba2ee 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -870,12 +870,21 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     return {};
   }
   mlir::Value VisitTypeTraitExpr(const TypeTraitExpr *e) {
+    // We diverge slightly from classic codegen here because CIR has stricter
+    // typing. In LLVM IR, constant folding covers up some potential type
+    // mismatches such as bool-to-int conversions that would fail the verifier
+    // in CIR. To make things work, we need to be sure we only emit a bool value
+    // if the expression type is bool.
     mlir::Location loc = cgf.getLoc(e->getExprLoc());
-    if (e->isStoredAsBoolean())
-      return builder.getBool(e->getBoolValue(), loc);
-    cgf.cgm.errorNYI(e->getSourceRange(),
-                     "ScalarExprEmitter: TypeTraitExpr stored as int");
-    return {};
+    if (e->isStoredAsBoolean()) {
+      if (e->getType()->isBooleanType())
+        return builder.getBool(e->getBoolValue(), loc);
+      assert(e->getType()->isIntegerType() &&
+             "Expected int type for TypeTraitExpr");
+      return builder.getConstInt(loc, cgf.convertType(e->getType()),
+                                 (uint64_t)e->getBoolValue());
+    }
+    return builder.getConstInt(loc, e->getAPValue().getInt());
   }
   mlir::Value
   VisitConceptSpecializationExpr(const ConceptSpecializationExpr *e) {
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 3d4aa552b6af2..eb778f3583a98 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2426,7 +2426,16 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *value, LValue lvalue,
 static RValue EmitLoadOfMatrixLValue(LValue LV, SourceLocation Loc,
                                      CodeGenFunction &CGF) {
   assert(LV.getType()->isConstantMatrixType());
-  Address Addr = MaybeConvertMatrixAddress(LV.getAddress(), CGF);
+  RawAddress DestAddr = LV.getAddress();
+
+  // HLSL constant buffers may pad matrix layouts, so copy elements into a
+  // non-padded local alloca before loading.
+  if (CGF.getLangOpts().HLSL &&
+      LV.getType().getAddressSpace() == LangAS::hlsl_constant)
+    DestAddr =
+        CGF.CGM.getHLSLRuntime().createBufferMatrixTempAddress(LV, Loc, CGF);
+
+  Address Addr = MaybeConvertMatrixAddress(DestAddr, CGF);
   LV.setAddress(Addr);
   return RValue::get(CGF.EmitLoadOfScalar(LV, Loc));
 }
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index c68c9f16482ff..805f7a8b4445b 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -1356,6 +1356,32 @@ bool CGHLSLRuntime::emitResourceArrayCopy(LValue &LHS, Expr *RHSExpr,
   return EndIndex.has_value();
 }
 
+RawAddress CGHLSLRuntime::createBufferMatrixTempAddress(const LValue &LV,
+                                                        SourceLocation Loc,
+                                                        CodeGenFunction &CGF) {
+
+  assert(LV.getType()->isConstantMatrixType() && "expected matrix type");
+  assert(LV.getType().getAddressSpace() == LangAS::hlsl_constant &&
+         "expected cbuffer matrix");
+
+  QualType MatQualTy = LV.getType();
+  llvm::Type *MemTy = CGF.ConvertTypeForMem(MatQualTy);
+  llvm::Type *LayoutTy = HLSLBufferLayoutBuilder(CGF.CGM).layOutType(MatQualTy);
+
+  if (LayoutTy == MemTy)
+    return LV.getAddress();
+
+  Address SrcAddr = LV.getAddress();
+  // NOTE: CreateMemTemp flattens MatrixTypes, which causes overlapping
+  // GEPs in emitBufferCopy, so use CreateTempAlloca with the non-padded
+  // layout instead.
+  CharUnits Align =
+      CharUnits::fromQuantity(CGF.CGM.getDataLayout().getABITypeAlign(MemTy));
+  RawAddress DestAlloca = CGF.CreateTempAlloca(MemTy, Align, "matrix.buf.copy");
+  emitBufferCopy(CGF, DestAlloca, SrcAddr, MatQualTy);
+  return DestAlloca;
+}
+
 std::optional<LValue> CGHLSLRuntime::emitBufferArraySubscriptExpr(
     const ArraySubscriptExpr *E, CodeGenFunction &CGF,
     llvm::function_ref<llvm::Value *(bool Promote)> EmitIdxAfterBase) {
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 62349c9dea7eb..dbbc887353cec 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -281,6 +281,9 @@ class CGHLSLRuntime {
       const ArraySubscriptExpr *E, CodeGenFunction &CGF,
       llvm::function_ref<llvm::Value *(bool Promote)> EmitIdxAfterBase);
 
+  RawAddress createBufferMatrixTempAddress(const LValue &LV, SourceLocation Loc,
+                                           CodeGenFunction &CGF);
+
   bool emitBufferCopy(CodeGenFunction &CGF, Address DestPtr, Address SrcPtr,
                       QualType CType);
 
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index a5ef4ac9d361d..29dcabd1b0971 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -248,6 +248,8 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) {
   LLVMContext &Ctx = getModule()->getContext();
   std::unique_ptr<DiagnosticHandler> OldDiagnosticHandler =
     Ctx.getDiagnosticHandler();
+  llvm::scope_exit RestoreDiagnosticHandler(
+      [&]() { Ctx.setDiagnosticHandler(std::move(OldDiagnosticHandler)); });
   Ctx.setDiagnosticHandler(std::make_unique<ClangDiagnosticHandler>(
       CodeGenOpts, this));
 
@@ -311,8 +313,6 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) {
                     C.getTargetInfo().getDataLayoutString(), getModule(),
                     Action, FS, std::move(AsmOutStream), this);
 
-  Ctx.setDiagnosticHandler(std::move(OldDiagnosticHandler));
-
   if (OptRecordFile)
     OptRecordFile->keep();
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 6a087be3751f0..43b8af0b2156a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2928,7 +2928,9 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
       B.addAttribute(llvm::Attribute::MinSize);
   }
 
-  if (D->hasAttr<NoOutlineAttr>())
+  // Add `nooutline` if Outlining is disabled with a command-line flag or a
+  // function attribute.
+  if (CodeGenOpts.DisableOutlining || D->hasAttr<NoOutlineAttr>())
     B.addAttribute(llvm::Attribute::NoOutline);
 
   F->addFnAttrs(B);
diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
index 07cc738882b50..0b644b9d8e441 100644
--- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
+++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
@@ -103,10 +103,8 @@ HLSLBufferLayoutBuilder::layOutStruct(const RecordType *RT,
   return NewTy;
 }
 
-llvm::Type *HLSLBufferLayoutBuilder::layOutArray(const ConstantArrayType *AT) {
-  llvm::Type *EltTy = layOutType(AT->getElementType());
-  uint64_t Count = AT->getZExtSize();
-
+llvm::Type *HLSLBufferLayoutBuilder::padArrayElements(llvm::Type *EltTy,
+                                                      uint64_t Count) {
   CharUnits EltSize =
       CharUnits::fromQuantity(CGM.getDataLayout().getTypeSizeInBits(EltTy) / 8);
   CharUnits Padding = EltSize.alignTo(CBufferRowSize) - EltSize;
@@ -127,6 +125,22 @@ llvm::Type *HLSLBufferLayoutBuilder::layOutArray(const ConstantArrayType *AT) {
       /*IsPacked=*/true);
 }
 
+llvm::Type *HLSLBufferLayoutBuilder::layOutArray(const ConstantArrayType *AT) {
+  llvm::Type *EltTy = layOutType(AT->getElementType());
+  uint64_t Count = AT->getZExtSize();
+  return padArrayElements(EltTy, Count);
+}
+
+llvm::Type *
+HLSLBufferLayoutBuilder::layOutMatrix(const ConstantMatrixType *MT) {
+  // ConvertTypeForMem already handles row/column-major layout and bool
+  // promotion, producing [Count x <VecLen x EltTy>]. We just need to add
+  // cbuffer padding between the array elements.
+  llvm::ArrayType *MemTy =
+      cast<llvm::ArrayType>(CGM.getTypes().ConvertTypeForMem(QualType(MT, 0)));
+  return padArrayElements(MemTy->getElementType(), MemTy->getNumElements());
+}
+
 llvm::Type *HLSLBufferLayoutBuilder::layOutType(QualType Ty) {
   if (const auto *AT = CGM.getContext().getAsConstantArrayType(Ty))
     return layOutArray(AT);
@@ -136,6 +150,11 @@ llvm::Type *HLSLBufferLayoutBuilder::layOutType(QualType Ty) {
     return layOutStruct(Ty->getAsCanonical<RecordType>(), EmptyOffsets);
   }
 
+  if (Ty->isConstantMatrixType()) {
+    const auto *MT = Ty->castAs<ConstantMatrixType>();
+    return layOutMatrix(MT);
+  }
+
   return CGM.getTypes().ConvertTypeForMem(Ty);
 }
 
diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
index c55f680fe5a98..5d75b36993d1f 100644
--- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
+++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
@@ -25,6 +25,10 @@ class HLSLBufferLayoutBuilder {
 private:
   CodeGenModule &CGM;
 
+  /// Pads an array of elements to 16-byte cbuffer row boundaries.
+  /// This implements the common pattern of padding all-but-the-last element.
+  llvm::Type *padArrayElements(llvm::Type *EltTy, uint64_t Count);
+
 public:
   HLSLBufferLayoutBuilder(CodeGenModule &CGM) : CGM(CGM) {}
 
@@ -45,6 +49,9 @@ class HLSLBufferLayoutBuilder {
   /// Lays out an array type following HLSL buffer rules.
   llvm::Type *layOutArray(const ConstantArrayType *AT);
 
+  /// Lays out a matrix type following HLSL buffer rules.
+  llvm::Type *layOutMatrix(const ConstantMatrixType *MT);
+
   /// Lays out a type following HLSL buffer rules. Arrays and structures will be
   /// padded appropriately and nested objects will be converted as appropriate.
   llvm::Type *layOutType(QualType Type);
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index d7d744d1770b6..1f3f66bf37c4a 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2318,7 +2318,14 @@ int Driver::ExecuteCompilation(
 
     if (!FailingCommand->getCreator().hasGoodDiagnostics() || CommandRes != 1) {
       // FIXME: See FIXME above regarding result code interpretation.
+#if LLVM_ON_UNIX
+      // On Unix, signals are represented by return codes of 128 plus the
+      // signal number. Return code 255 is excluded because some tools,
+      // such as llvm-ifs, exit with code 255 (-1) on failure.
+      if (CommandRes > 128 && CommandRes != 255)
+#else
       if (CommandRes < 0)
+#endif
         Diag(clang::diag::err_drv_command_signalled)
             << FailingTool.getShortName();
       else
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 8bb271d27a3c4..9a17fa2546e68 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2966,11 +2966,12 @@ void tools::addMachineOutlinerArgs(const Driver &D,
         D.Diag(diag::warn_drv_moutline_unsupported_opt) << Triple.getArchName();
       }
     } else {
-      // Disable all outlining behaviour.
-      //
-      // FIXME: This should probably use the `nooutline` attribute rather than
-      // tweaking Pipeline Pass flags, so `-mno-outline` and `-moutline` objects
-      // can be combined correctly during LTO.
+      if (!IsLTO)
+        // Disable all outlining behaviour using the `nooutline` attribute, in
+        // case the linker invocation lacks `-mno-outline`.
+        CmdArgs.push_back("-mno-outline");
+
+      // Disable the outliner pass in the pipeline.
       addArg(Twine("-enable-machine-outliner=never"));
     }
   }
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 1c95a79a52a9c..74fcb10c0be22 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1145,11 +1145,11 @@ void Darwin::VerifyTripleForSDK(const llvm::opt::ArgList &Args,
       getDriver().Diag(diag::warn_incompatible_sysroot)
           << SDKInfo->getDisplayName() << Triple.getTriple();
   } else if (const Arg *A = Args.getLastArg(options::OPT_isysroot)) {
+    // If there is no SDK info, assume this is building against an SDK that
+    // predates SDKSettings.json. Try to match the triple to the SDK path.
     const char *isysroot = A->getValue();
-    StringRef SDK = getSDKName(isysroot);
-    if (!SDK.empty()) {
-      size_t StartVer = SDK.find_first_of("0123456789");
-      StringRef SDKName = SDK.slice(0, StartVer);
+    StringRef SDKName = getSDKName(isysroot);
+    if (!SDKName.empty()) {
       bool supported = true;
       if (Triple.isWatchOS())
         supported = SDKName.starts_with("Watch");
@@ -1161,9 +1161,8 @@ void Darwin::VerifyTripleForSDK(const llvm::opt::ArgList &Args,
         supported = SDKName.starts_with("iPhone");
       else if (Triple.isMacOSX())
         supported = SDKName.starts_with("MacOSX");
-      else
-        llvm::reportFatalUsageError(Twine("SDK at '") + isysroot +
-                                    "' missing SDKSettings.json.");
+      // If it's not an older SDK, then it might be a damaged SDK or a
+      // non-standard -isysroot path. Don't try to diagnose that here.
 
       if (!supported)
         getDriver().Diag(diag::warn_incompatible_sysroot)
@@ -2484,6 +2483,8 @@ void Darwin::AddDeploymentTarget(DerivedArgList &Args) const {
   // Read the SDKSettings.json file for more information, like the SDK version
   // that we can pass down to the compiler.
   SDKInfo = parseSDKSettings(getVFS(), Args, getDriver());
+  // FIXME: If SDKInfo is std::nullopt, diagnose a bad isysroot value (e.g.
+  // doesn't end in .sdk).
 
   // The OS and the version can be specified using the -target argument.
   std::optional<DarwinPlatform> PlatformAndVersion =
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 4823bb4265789..802a1bdbccfdd 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -218,8 +218,8 @@ Decl *SemaHLSL::ActOnStartBuffer(Scope *BufferScope, bool CBuffer,
 
 static unsigned calculateLegacyCbufferFieldAlign(const ASTContext &Context,
                                                  QualType T) {
-  // Arrays and Structs are always aligned to new buffer rows
-  if (T->isArrayType() || T->isStructureType())
+  // Arrays, Matrices, and Structs are always aligned to new buffer rows
+  if (T->isArrayType() || T->isStructureType() || T->isConstantMatrixType())
     return 16;
 
   // Vectors are aligned to the type they contain
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 498ffd0887630..b79c22603494c 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -6947,10 +6947,39 @@ void InitializationSequence::InitializeFrom(Sema &S,
   // For HLSL ext vector types we allow list initialization behavior for C++
   // functional cast expressions which look like constructor syntax. This is
   // accomplished by converting initialization arguments to InitListExpr.
-  if (S.getLangOpts().HLSL && Args.size() > 1 &&
-      (DestType->isExtVectorType() || DestType->isConstantMatrixType()) &&
-      (SourceType.isNull() ||
-       !Context.hasSameUnqualifiedType(SourceType, DestType))) {
+  auto ShouldTryListInitialization = [&]() -> bool {
+    // Only try list initialization for HLSL.
+    if (!S.getLangOpts().HLSL)
+      return false;
+
+    bool DestIsVec = DestType->isExtVectorType();
+    bool DestIsMat = DestType->isConstantMatrixType();
+
+    // If the destination type is neither a vector nor a matrix, then don't try
+    // list initialization.
+    if (!DestIsVec && !DestIsMat)
+      return false;
+
+    // If there is only a single source argument, then only try list
+    // initialization if initializing a matrix with a vector or vice versa.
+    if (Args.size() == 1) {
+      assert(!SourceType.isNull() &&
+             "Source QualType should not be null when arg size is exactly 1");
+      bool SourceIsVec = SourceType->isExtVectorType();
+      bool SourceIsMat = SourceType->isConstantMatrixType();
+
+      if (DestIsMat && !SourceIsVec)
+        return false;
+      if (DestIsVec && !SourceIsMat)
+        return false;
+    }
+
+    // Try list initialization if the source type is null or if the
+    // destination and source types differ.
+    return SourceType.isNull() ||
+           !Context.hasSameUnqualifiedType(SourceType, DestType);
+  };
+  if (ShouldTryListInitialization()) {
     InitListExpr *ILE = new (Context)
         InitListExpr(S.getASTContext(), Args.front()->getBeginLoc(), Args,
                      Args.back()->getEndLoc());
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index a6d4b989cae3d..c0c0ab7a09c72 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -9309,59 +9309,11 @@ bool Sema::hasAcceptableDefinition(NamedDecl *D, NamedDecl **Suggested,
 
   // If this definition was instantiated from a template, map back to the
   // pattern from which it was instantiated.
-  if (isa<TagDecl>(D) && cast<TagDecl>(D)->isBeingDefined())
+  if (isa<TagDecl>(D) && cast<TagDecl>(D)->isBeingDefined()) {
     // We're in the middle of defining it; this definition should be treated
     // as visible.
     return true;
-
-  auto DefinitionIsAcceptable = [&](NamedDecl *D) {
-    // The (primary) definition might be in a visible module.
-    if (isAcceptable(D, Kind))
-      return true;
-
-    // A visible module might have a merged definition instead.
-    if (D->isModulePrivate() ? hasMergedDefinitionInCurrentModule(D)
-                             : hasVisibleMergedDefinition(D)) {
-      if (CodeSynthesisContexts.empty() &&
-          !getLangOpts().ModulesLocalVisibility) {
-        // Cache the fact that this definition is implicitly visible because
-        // there is a visible merged definition.
-        D->setVisibleDespiteOwningModule();
-      }
-      return true;
-    }
-
-    return false;
-  };
-  auto IsDefinition = [](NamedDecl *D) {
-    if (auto *RD = dyn_cast<CXXRecordDecl>(D))
-      return RD->isThisDeclarationADefinition();
-    if (auto *ED = dyn_cast<EnumDecl>(D))
-      return ED->isThisDeclarationADefinition();
-    if (auto *FD = dyn_cast<FunctionDecl>(D))
-      return FD->isThisDeclarationADefinition();
-    if (auto *VD = dyn_cast<VarDecl>(D))
-      return VD->isThisDeclarationADefinition() == VarDecl::Definition;
-    llvm_unreachable("unexpected decl type");
-  };
-  auto FoundAcceptableDefinition = [&](NamedDecl *D) {
-    if (!isa<CXXRecordDecl, FunctionDecl, EnumDecl, VarDecl>(D))
-      return DefinitionIsAcceptable(D);
-
-    for (auto *RD : D->redecls()) {
-      auto *ND = cast<NamedDecl>(RD);
-      if (!IsDefinition(ND))
-        continue;
-      if (DefinitionIsAcceptable(ND)) {
-        *Suggested = ND;
-        return true;
-      }
-    }
-
-    return false;
-  };
-
-  if (auto *RD = dyn_cast<CXXRecordDecl>(D)) {
+  } else if (auto *RD = dyn_cast<CXXRecordDecl>(D)) {
     if (auto *Pattern = RD->getTemplateInstantiationPattern())
       RD = Pattern;
     D = RD->getDefinition();
@@ -9400,14 +9352,34 @@ bool Sema::hasAcceptableDefinition(NamedDecl *D, NamedDecl **Suggested,
 
   *Suggested = D;
 
-  if (FoundAcceptableDefinition(D))
+  auto DefinitionIsAcceptable = [&] {
+    // The (primary) definition might be in a visible module.
+    if (isAcceptable(D, Kind))
+      return true;
+
+    // A visible module might have a merged definition instead.
+    if (D->isModulePrivate() ? hasMergedDefinitionInCurrentModule(D)
+                             : hasVisibleMergedDefinition(D)) {
+      if (CodeSynthesisContexts.empty() &&
+          !getLangOpts().ModulesLocalVisibility) {
+        // Cache the fact that this definition is implicitly visible because
+        // there is a visible merged definition.
+        D->setVisibleDespiteOwningModule();
+      }
+      return true;
+    }
+
+    return false;
+  };
+
+  if (DefinitionIsAcceptable())
     return true;
 
   // The external source may have additional definitions of this entity that are
   // visible, so complete the redeclaration chain now and ask again.
   if (auto *Source = Context.getExternalSource()) {
     Source->CompleteRedeclChain(D);
-    return FoundAcceptableDefinition(D);
+    return DefinitionIsAcceptable();
   }
 
   return false;
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index f0fb247f1afb9..f8e9caa3f5d1d 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -3642,9 +3642,23 @@ template<>
 void ASTDeclReader::attachPreviousDeclImpl(ASTReader &Reader,
                                            Redeclarable<VarDecl> *D,
                                            Decl *Previous, Decl *Canon) {
+  auto *VD = static_cast<VarDecl *>(D);
   auto *PrevVD = cast<VarDecl>(Previous);
   D->RedeclLink.setPrevious(PrevVD);
   D->First = PrevVD->First;
+
+  // We should keep at most one definition on the chain.
+  // FIXME: Cache the definition once we've found it. Building a chain with
+  // N definitions currently takes O(N^2) time here.
+  if (VD->isThisDeclarationADefinition() == VarDecl::Definition) {
+    for (VarDecl *CurD = PrevVD; CurD; CurD = CurD->getPreviousDecl()) {
+      if (CurD->isThisDeclarationADefinition() == VarDecl::Definition) {
+        Reader.mergeDefinitionVisibility(CurD, VD);
+        VD->demoteThisDefinitionToDeclaration();
+        break;
+      }
+    }
+  }
 }
 
 static bool isUndeducedReturnType(QualType T) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp b/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
index e682c4ef80896..f226f80aa441f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
@@ -70,16 +70,12 @@ enum FoundationClass {
 
 static FoundationClass findKnownClass(const ObjCInterfaceDecl *ID,
                                       bool IncludeSuperclasses = true) {
-  static llvm::StringMap<FoundationClass> Classes;
-  if (Classes.empty()) {
-    Classes["NSArray"] = FC_NSArray;
-    Classes["NSDictionary"] = FC_NSDictionary;
-    Classes["NSEnumerator"] = FC_NSEnumerator;
-    Classes["NSNull"] = FC_NSNull;
-    Classes["NSOrderedSet"] = FC_NSOrderedSet;
-    Classes["NSSet"] = FC_NSSet;
-    Classes["NSString"] = FC_NSString;
-  }
+  static const llvm::StringMap<FoundationClass> Classes{
+      {"NSArray", FC_NSArray},           {"NSDictionary", FC_NSDictionary},
+      {"NSEnumerator", FC_NSEnumerator}, {"NSNull", FC_NSNull},
+      {"NSOrderedSet", FC_NSOrderedSet}, {"NSSet", FC_NSSet},
+      {"NSString", FC_NSString},
+  };
 
   // FIXME: Should we cache this at all?
   FoundationClass result = Classes.lookup(ID->getIdentifier()->getName());
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoDeleteChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoDeleteChecker.cpp
index 2740890704767..abbdc2967e859 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoDeleteChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoDeleteChecker.cpp
@@ -92,7 +92,11 @@ class NoDeleteChecker : public Checker<check::ASTDecl<TranslationUnitDecl>> {
       return;
 
     auto Body = FD->getBody();
-    if (!Body || TFA.isTrivial(Body))
+    if (!Body)
+      return;
+
+    auto hasTrivialDtor = [&](VarDecl *D) { return TFA.hasTrivialDtor(D); };
+    if (llvm::all_of(FD->parameters(), hasTrivialDtor) && TFA.isTrivial(Body))
       return;
 
     SmallString<100> Buf;
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index c47dabf2ec5b0..8cd64c12b7a73 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -516,19 +516,57 @@ class TrivialFunctionAnalysisVisitor
     return Result;
   }
 
+  bool CanTriviallyDestruct(QualType Ty) {
+    assert(!Ty.isNull());
+
+    // T*, T& or T&& does not run its destructor.
+    if (Ty->isPointerOrReferenceType())
+      return true;
+
+    // Primitive types don't have destructors.
+    if (Ty->isIntegralOrEnumerationType())
+      return true;
+
+    if (const auto *R = Ty->getAsCXXRecordDecl()) {
+      // C++ trivially destructible classes are fine.
+      if (R->hasTrivialDestructor())
+        return true;
+
+      // For WebKit, side-effects are fine as long as we don't delete objects,
+      // so check recursively.
+      if (const auto *Dtor = R->getDestructor())
+        return IsFunctionTrivial(Dtor);
+    }
+
+    // Structs in C are trivial.
+    if (Ty->isRecordType())
+      return true;
+
+    // For arrays it depends on the element type.
+    // FIXME: We should really use ASTContext::getAsArrayType instead.
+    if (const auto *AT = Ty->getAsArrayTypeUnsafe())
+      return CanTriviallyDestruct(AT->getElementType());
+
+    return false; // Otherwise it's likely not trivial.
+  }
+
 public:
   using CacheTy = TrivialFunctionAnalysis::CacheTy;
 
   TrivialFunctionAnalysisVisitor(CacheTy &Cache) : Cache(Cache) {}
 
   bool IsFunctionTrivial(const Decl *D) {
-    if (auto *FnDecl = dyn_cast<FunctionDecl>(D)) {
-      if (isNoDeleteFunction(FnDecl))
-        return true;
-      if (FnDecl->isVirtualAsWritten())
-        return false;
-    }
     return WithCachedResult(D, [&]() {
+      if (auto *FnDecl = dyn_cast<FunctionDecl>(D)) {
+        if (isNoDeleteFunction(FnDecl))
+          return true;
+        if (auto *MD = dyn_cast<CXXMethodDecl>(D); MD && MD->isVirtual())
+          return false;
+        for (auto *Param : FnDecl->parameters()) {
+          if (!HasTrivialDestructor(Param))
+            return false;
+        }
+      }
       if (auto *CtorDecl = dyn_cast<CXXConstructorDecl>(D)) {
         for (auto *CtorInit : CtorDecl->inits()) {
           if (!Visit(CtorInit->getInit()))
@@ -542,6 +580,11 @@ class TrivialFunctionAnalysisVisitor
     });
   }
 
+  bool HasTrivialDestructor(const VarDecl *VD) {
+    return WithCachedResult(
+        VD, [&] { return CanTriviallyDestruct(VD->getType()); });
+  }
+
   bool IsStatementTrivial(const Stmt *S) {
     auto CacheIt = Cache.find(S);
     if (CacheIt != Cache.end())
@@ -579,7 +622,16 @@ class TrivialFunctionAnalysisVisitor
     return true;
   }
 
-  bool VisitDeclStmt(const DeclStmt *DS) { return VisitChildren(DS); }
+  bool VisitDeclStmt(const DeclStmt *DS) {
+    for (auto &Decl : DS->decls()) {
+      // FIXME: Handle DecompositionDecls.
+      if (auto *VD = dyn_cast<VarDecl>(Decl)) {
+        if (!HasTrivialDestructor(VD))
+          return false;
+      }
+    }
+    return VisitChildren(DS);
+  }
   bool VisitDoStmt(const DoStmt *DS) { return VisitChildren(DS); }
   bool VisitIfStmt(const IfStmt *IS) {
     return WithCachedResult(IS, [&]() { return VisitChildren(IS); });
@@ -731,6 +783,10 @@ class TrivialFunctionAnalysisVisitor
     return true;
   }
 
+  bool VisitCXXDefaultInitExpr(const CXXDefaultInitExpr *E) {
+    return Visit(E->getExpr());
+  }
+
   bool checkArguments(const CallExpr *CE) {
     for (const Expr *Arg : CE->arguments()) {
       if (Arg && !Visit(Arg))
@@ -749,6 +805,10 @@ class TrivialFunctionAnalysisVisitor
     return IsFunctionTrivial(CE->getConstructor());
   }
 
+  bool VisitCXXDeleteExpr(const CXXDeleteExpr *DE) {
+    return CanTriviallyDestruct(DE->getDestroyedType());
+  }
+
   bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E) {
     return IsFunctionTrivial(E->getConstructor());
   }
@@ -769,7 +829,7 @@ class TrivialFunctionAnalysisVisitor
 
   bool VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *BTE) {
     if (auto *Temp = BTE->getTemporary()) {
-      if (!TrivialFunctionAnalysis::isTrivialImpl(Temp->getDestructor(), Cache))
+      if (!IsFunctionTrivial(Temp->getDestructor()))
         return false;
     }
     return Visit(BTE->getSubExpr());
@@ -857,4 +917,10 @@ bool TrivialFunctionAnalysis::isTrivialImpl(
   return V.IsStatementTrivial(S);
 }
 
+bool TrivialFunctionAnalysis::hasTrivialDtorImpl(const VarDecl *VD,
+                                                 CacheTy &Cache) {
+  TrivialFunctionAnalysisVisitor V(Cache);
+  return V.HasTrivialDestructor(VD);
+}
+
 } // namespace clang
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index 431357a2150be..8a696a789c65b 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -28,6 +28,7 @@ class Stmt;
 class TranslationUnitDecl;
 class Type;
 class TypedefDecl;
+class VarDecl;
 
 // Ref-countability of a type is implicitly defined by Ref<T> and RefPtr<T>
 // implementation. It can be modeled as: type T having public methods ref() and
@@ -169,6 +170,9 @@ class TrivialFunctionAnalysis {
   /// \returns true if \p D is a "trivial" function.
   bool isTrivial(const Decl *D) const { return isTrivialImpl(D, TheCache); }
   bool isTrivial(const Stmt *S) const { return isTrivialImpl(S, TheCache); }
+  bool hasTrivialDtor(const VarDecl *VD) const {
+    return hasTrivialDtorImpl(VD, TheCache);
+  }
 
 private:
   friend class TrivialFunctionAnalysisVisitor;
@@ -179,6 +183,7 @@ class TrivialFunctionAnalysis {
 
   static bool isTrivialImpl(const Decl *D, CacheTy &Cache);
   static bool isTrivialImpl(const Stmt *S, CacheTy &Cache);
+  static bool hasTrivialDtorImpl(const VarDecl *VD, CacheTy &Cache);
 };
 
 } // namespace clang
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index 8a24a3c64e0e4..c139a5cb13de7 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -227,12 +227,20 @@ template <typename T> bool operator!=(const RefPtr<T> &, T &) { return false; }
 struct RefCountable {
   static Ref<RefCountable> create();
   static std::unique_ptr<RefCountable> makeUnique();
-  void ref() {}
-  void deref() {}
+  void ref() { ++m_refCount; }
+  void deref() {
+    --m_refCount;
+    if (!--m_refCount)
+      delete this;
+  }
+  ~RefCountable();
   void method();
   void constMethod() const;
   int trivial() { return 123; }
   RefCountable* next();
+  
+private:
+  unsigned m_refCount { 0 };
 };
 
 template <typename T> T *downcast(T *t) { return t; }
@@ -280,11 +288,14 @@ template <typename T> struct CheckedPtr {
 
 class CheckedObj {
 public:
-  void incrementCheckedPtrCount();
-  void decrementCheckedPtrCount();
+  void incrementCheckedPtrCount() { ++m_ptrCount; }
+  void decrementCheckedPtrCount() { --m_ptrCount; }
   void method();
   int trivial() { return 123; }
   CheckedObj* next();
+
+private:
+  unsigned m_ptrCount { 0 };
 };
 
 class RefCountableAndCheckable {
@@ -348,8 +359,8 @@ class WeakPtrImpl {
 
 private:
   template <typename T>
-  WeakPtrImpl(T* t)
-    : ptr(static_cast<void*>(t))
+  WeakPtrImpl(T& t)
+    : ptr(static_cast<void*>(&t))
   { }
 };
 
@@ -361,9 +372,9 @@ class CanMakeWeakPtr {
   template <typename U> friend class CanMakeWeakPtr;
   template <typename U> friend class WeakPtr;
 
-  Ref<WeakPtrImpl> createWeakPtrImpl() {
+  WeakPtrImpl& createWeakPtrImpl() {
     if (!impl)
-      impl = WeakPtrImpl::create(static_cast<T>(*this));
+      impl = WeakPtrImpl::create(static_cast<T&>(*this));
     return *impl;
   }
 
@@ -382,21 +393,26 @@ class WeakPtr {
   RefPtr<WeakPtrImpl> impl;
 
 public:
-  WeakPtr(T& t) {
-    *this = t;
+  WeakPtr(T& t)
+    : impl(t.createWeakPtrImpl()) {
   }
-  WeakPtr(T* t) {
-    *this = t;
+  WeakPtr(T* t)
+    : impl(t ? &t->createWeakPtrImpl() : nullptr) {
   }
 
   template <typename U>
   WeakPtr<T> operator=(U& obj) {
     impl = obj.createWeakPtrImpl();
+    return *this;
   }
 
   template <typename U>
   WeakPtr<T> operator=(U* obj) {
-    impl = obj ? obj->createWeakPtrImpl() : nullptr;
+    if (obj)
+      impl = obj->createWeakPtrImpl();
+    else
+      impl = nullptr;
+    return *this;
   }
 
   T* get() {
diff --git a/clang/test/Analysis/Checkers/WebKit/nodelete-annotation.cpp b/clang/test/Analysis/Checkers/WebKit/nodelete-annotation.cpp
index 82667a7916f42..98f4017e5e3fd 100644
--- a/clang/test/Analysis/Checkers/WebKit/nodelete-annotation.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/nodelete-annotation.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoDeleteChecker -verify %s
 
+#include "mock-types.h"
+
 void someFunction();
 void [[clang::annotate_type("webkit.nodelete")]] safeFunction();
 
@@ -28,7 +30,28 @@ void [[clang::annotate_type("webkit.nodelete")]] defWithNoDelete() {
   someFunction();
 }
 
+class WeakRefCountable : public CanMakeWeakPtr<WeakRefCountable> {
+public:
+  static Ref<WeakRefCountable> create();
+
+  ~WeakRefCountable();
+
+  void ref() { m_refCount++; }
+  void deref() {
+    m_refCount--;
+    if (!m_refCount)
+      delete this;
+  }
+
+private:
+  WeakRefCountable();
+
+  unsigned m_refCount { 0 };
+};
+
 class SomeClass {
+public:
+
   void [[clang::annotate_type("webkit.nodelete")]] someMethod();
   void [[clang::annotate_type("webkit.nodelete")]] unsafeMethod() {
     // expected-warning@-1{{A function 'unsafeMethod' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
@@ -57,6 +80,59 @@ class SomeClass {
   }
 
   virtual void [[clang::annotate_type("webkit.nodelete")]] anotherVirtualMethod();
+
+  void [[clang::annotate_type("webkit.nodelete")]] setObj(RefCountable* obj) {
+    // expected-warning@-1{{A function 'setObj' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
+    m_obj = obj;
+  }
+
+  void [[clang::annotate_type("webkit.nodelete")]] swapObj(RefPtr<RefCountable>&& obj) {
+    m_obj.swap(obj);
+  }
+
+  void [[clang::annotate_type("webkit.nodelete")]] clearObj(RefCountable* obj) {
+    // expected-warning@-1{{A function 'clearObj' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
+    m_obj = nullptr;
+  }
+
+  void [[clang::annotate_type("webkit.nodelete")]] deposeArg(WeakRefCountable&& unused) {
+  }
+
+  void [[clang::annotate_type("webkit.nodelete")]] deposeArgPtr(RefPtr<RefCountable>&& unused) {
+  }
+
+  enum class E : unsigned char { V1, V2 };
+  bool [[clang::annotate_type("webkit.nodelete")]] deposeArgEnum() {
+    E&& e = E::V1;
+    return e != E::V2;
+  }
+
+  void [[clang::annotate_type("webkit.nodelete")]] deposeLocal() {
+    // expected-warning@-1{{A function 'deposeLocal' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
+    RefPtr<RefCountable> obj = std::move(m_obj);
+  }
+
+  RefPtr<RefCountable> [[clang::annotate_type("webkit.nodelete")]] copyRefPtr() {
+    return m_obj;
+  }
+
+  Ref<WeakRefCountable> [[clang::annotate_type("webkit.nodelete")]] copyRef() {
+    return *m_weakObj.get();
+  }
+
+  RefPtr<WeakRefCountable> [[clang::annotate_type("webkit.nodelete")]] getWeakPtr() {
+    return m_weakObj.get();
+  }
+
+  WeakRefCountable* [[clang::annotate_type("webkit.nodelete")]] useWeakPtr() {
+    WeakPtr localWeak = m_weakObj.get();
+    return localWeak.get();
+  }
+
+private:
+  RefPtr<RefCountable> m_obj;
+  Ref<RefCountable> m_ref;
+  WeakPtr<WeakRefCountable> m_weakObj;
 };
 
 class IntermediateClass : public SomeClass {
@@ -81,3 +157,50 @@ class Derived : public Base<Type> {
 public:
   virtual unsigned foo() const { return 0; }
 };
+
+struct Data {
+  static Ref<Data> create() {
+    return adoptRef(*new Data);
+  }
+
+  void ref() {
+    ++refCount;
+  }
+
+  void deref() {
+    --refCount;
+    if (!refCount)
+      delete this;
+  }
+
+  virtual void doSomething() { }
+
+  int a[3] { 0 };
+  
+protected:
+  Data() = default;
+
+private:
+  unsigned refCount { 0 };
+};
+
+struct SubData : Data {
+  static Ref<SubData> create() {
+    return adoptRef(*new SubData);
+  }
+
+  void doSomething() override { }
+
+private:
+  SubData() = default;
+};
+
+void [[clang::annotate_type("webkit.nodelete")]] makeData() {
+  RefPtr<Data> constantData[2] = { Data::create() };
+  RefPtr<Data> data[] = { Data::create() };
+}
+
+void [[clang::annotate_type("webkit.nodelete")]] makeSubData() {
+  // expected-warning@-1{{A function 'makeSubData' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
+  SubData::create()->doSomething();
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
index e8022b7fe8ba0..ad90198d5ac8b 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
@@ -26,6 +26,7 @@ void foo_ref() {
 void foo_ref_trivial() {
   RefCountable automatic;
   RefCountable &bar = automatic;
+  // expected-warning@-1{{Local variable 'bar' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
 }
 
 void bar_ref(RefCountable &) {}
@@ -63,7 +64,12 @@ void foo4() {
 void foo5() {
   RefPtr<RefCountable> foo;
   auto* bar = foo.get();
+  // expected-warning@-1{{Local variable 'bar' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
   bar->trivial();
+  {
+    auto* baz = foo.get();
+    baz->trivial();
+  }
 }
 
 void foo6() {
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index bfe418b112a9d..c1ed882069073 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -15,7 +15,6 @@
 // CHECK-NEXT: core.CallAndMessage
 // CHECK-NEXT: core.DivideZero
 // CHECK-NEXT: core.DynamicTypePropagation
-// CHECK-NEXT: core.FixedAddressDereference
 // CHECK-NEXT: core.NonNullParamChecker
 // CHECK-NEXT: core.NonnilStringConstants
 // CHECK-NEXT: core.NullDereference
diff --git a/clang/test/Analysis/builtin_bitcast.cpp b/clang/test/Analysis/builtin_bitcast.cpp
index 2ba32ec6d23d2..bcaec9ecc3096 100644
--- a/clang/test/Analysis/builtin_bitcast.cpp
+++ b/clang/test/Analysis/builtin_bitcast.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_analyze_cc1 -triple x86_64-unknown-unknown -verify %s \
-// RUN:   -analyzer-checker=core,debug.ExprInspection -analyzer-disable-checker=core.FixedAddressDereference
+// RUN:   -analyzer-checker=core,debug.ExprInspection
 
 template <typename T> void clang_analyzer_dump(T);
 using size_t = decltype(sizeof(int));
diff --git a/clang/test/Analysis/concrete-address.c b/clang/test/Analysis/concrete-address.c
index 683b7f29f4611..0822c8a0b7532 100644
--- a/clang/test/Analysis/concrete-address.c
+++ b/clang/test/Analysis/concrete-address.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.FixedAddr,optin.core.FixedAddressDereference -verify %s
 
 extern void __assert_fail (__const char *__assertion, __const char *__file,
     unsigned int __line, __const char *__function)
diff --git a/clang/test/Analysis/dtor.cpp b/clang/test/Analysis/dtor.cpp
index 9e00e937a7c29..ab46ff5ec5ecf 100644
--- a/clang/test/Analysis/dtor.cpp
+++ b/clang/test/Analysis/dtor.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,debug.ExprInspection,cplusplus -analyzer-disable-checker=core.FixedAddressDereference -analyzer-config c++-inlining=destructors -Wno-null-dereference -Wno-inaccessible-base -verify -analyzer-config eagerly-assume=false %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,debug.ExprInspection,cplusplus -analyzer-config c++-inlining=destructors -Wno-null-dereference -Wno-inaccessible-base -verify -analyzer-config eagerly-assume=false %s
 
 void clang_analyzer_eval(bool);
 void clang_analyzer_checkInlined(bool);
diff --git a/clang/test/Analysis/fixed-address-notes.c b/clang/test/Analysis/fixed-address-notes.c
index e246ee5a464b0..537fa8cbb6463 100644
--- a/clang/test/Analysis/fixed-address-notes.c
+++ b/clang/test/Analysis/fixed-address-notes.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output=text -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.core.FixedAddressDereference -analyzer-output=text -verify %s
 
 extern char *something();
 
diff --git a/clang/test/Analysis/misc-ps.m b/clang/test/Analysis/misc-ps.m
index 794d8bbceb459..c22e0dbb6137d 100644
--- a/clang/test/Analysis/misc-ps.m
+++ b/clang/test/Analysis/misc-ps.m
@@ -1,6 +1,6 @@
 // NOTE: Use '-fobjc-gc' to test the analysis being run twice, and multiple reports are not issued.
-// RUN: %clang_analyze_cc1 -triple i386-apple-darwin10 -analyzer-checker=core,alpha.core,osx.cocoa.AtSync -analyzer-disable-checker=core.FixedAddressDereference -Wno-strict-prototypes -Wno-pointer-to-int-cast -verify -fblocks -Wno-unreachable-code -Wno-null-dereference -Wno-objc-root-class %s
-// RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -analyzer-checker=core,alpha.core,osx.cocoa.AtSync -analyzer-disable-checker=core.FixedAddressDereference -Wno-strict-prototypes -Wno-pointer-to-int-cast -verify -fblocks -Wno-unreachable-code -Wno-null-dereference -Wno-objc-root-class %s
+// RUN: %clang_analyze_cc1 -triple i386-apple-darwin10 -analyzer-checker=core,alpha.core,osx.cocoa.AtSync -Wno-strict-prototypes -Wno-pointer-to-int-cast -verify -fblocks -Wno-unreachable-code -Wno-null-dereference -Wno-objc-root-class %s
+// RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -analyzer-checker=core,alpha.core,osx.cocoa.AtSync -Wno-strict-prototypes -Wno-pointer-to-int-cast -verify -fblocks -Wno-unreachable-code -Wno-null-dereference -Wno-objc-root-class %s
 
 #ifndef __clang_analyzer__
 #error __clang_analyzer__ not defined
diff --git a/clang/test/Analysis/pr22954.c b/clang/test/Analysis/pr22954.c
index b5f8aeb2a5ca6..3d1cac1972066 100644
--- a/clang/test/Analysis/pr22954.c
+++ b/clang/test/Analysis/pr22954.c
@@ -3,7 +3,7 @@
 // At the moment the whole of the destination array content is invalidated.
 // If a.s1 region has a symbolic offset, the whole region of 'a' is invalidated.
 // Specific triple set to test structures of size 0.
-// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,unix.Malloc,debug.ExprInspection -analyzer-disable-checker=core.FixedAddressDereference -Wno-error=int-conversion -verify -analyzer-config eagerly-assume=false %s
+// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,unix.Malloc,debug.ExprInspection -Wno-error=int-conversion -verify -analyzer-config eagerly-assume=false %s
 
 typedef __typeof(sizeof(int)) size_t;
 
diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
index 9b3296064981f..4de004e00687a 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
@@ -23,7 +23,6 @@
 // CHECK-NEXT: core.CallAndMessage
 // CHECK-NEXT: core.DivideZero
 // CHECK-NEXT: core.DynamicTypePropagation
-// CHECK-NEXT: core.FixedAddressDereference
 // CHECK-NEXT: core.NonNullParamChecker
 // CHECK-NEXT: core.NonnilStringConstants
 // CHECK-NEXT: core.NullDereference
diff --git a/clang/test/Analysis/suppress-dereferences-from-any-address-space.c b/clang/test/Analysis/suppress-dereferences-from-any-address-space.c
index 5b42262c87223..c14781876c4ef 100644
--- a/clang/test/Analysis/suppress-dereferences-from-any-address-space.c
+++ b/clang/test/Analysis/suppress-dereferences-from-any-address-space.c
@@ -1,7 +1,7 @@
-// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,alpha.core -std=gnu99 -analyzer-config suppress-dereferences-from-any-address-space=false -verify=x86-nosuppress,common %s
-// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,alpha.core -std=gnu99 -verify=x86-suppress,common %s
-// RUN: %clang_analyze_cc1 -triple arm-pc-linux-gnu -analyzer-checker=core,alpha.core -std=gnu99 -analyzer-config suppress-dereferences-from-any-address-space=false -verify=other-nosuppress,common %s
-// RUN: %clang_analyze_cc1 -triple arm-pc-linux-gnu -analyzer-checker=core,alpha.core -std=gnu99 -verify=other-suppress,common %s
+// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,optin.core.FixedAddressDereference -std=gnu99 -analyzer-config suppress-dereferences-from-any-address-space=false -verify=x86-nosuppress,common %s
+// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,optin.core.FixedAddressDereference -std=gnu99 -verify=x86-suppress,common %s
+// RUN: %clang_analyze_cc1 -triple arm-pc-linux-gnu -analyzer-checker=core,optin.core.FixedAddressDereference -std=gnu99 -analyzer-config suppress-dereferences-from-any-address-space=false -verify=other-nosuppress,common %s
+// RUN: %clang_analyze_cc1 -triple arm-pc-linux-gnu -analyzer-checker=core,optin.core.FixedAddressDereference -std=gnu99 -verify=other-suppress,common %s
 
 // Address-space attributes suppress the report even if the pointees are not marked `volatile`.
 #define AS_ATTRIBUTE(_X) __attribute__((address_space(_X)))
diff --git a/clang/test/CIR/CodeGenBuiltins/builtin-structured-binding-size.cpp b/clang/test/CIR/CodeGenBuiltins/builtin-structured-binding-size.cpp
new file mode 100644
index 0000000000000..cd391bfe14f4b
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/builtin-structured-binding-size.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+struct S {
+  int a;
+  double b;
+  char c;
+};
+
+int test_structured_binding_size() {
+  return __builtin_structured_binding_size(S);
+}
+
+// CIR: cir.func {{.*}} @_Z28test_structured_binding_sizev()
+// CIR:   %[[SIZE:.*]] = cir.const #cir.int<3> : !s32i
+// CIR:   cir.store %[[SIZE:.*]], %[[RETVAL:.*]]
+// CIR:   %[[RET:.*]] = cir.load %[[RETVAL:.*]]
+// CIR:   cir.return %[[RET:.*]] : !s32i
+
+// LLVM: define{{.*}} i32 @_Z28test_structured_binding_sizev()
+// LLVM:   store i32 3, ptr %[[RETVAL:.*]]
+// LLVM:   %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM:   ret i32 %[[RET:.*]]
+
+// OGCG: define{{.*}} i32 @_Z28test_structured_binding_sizev()
+// OGCG:   ret i32 3
diff --git a/clang/test/CIR/CodeGenBuiltins/builtin-trivally-copyable.cpp b/clang/test/CIR/CodeGenBuiltins/builtin-trivally-copyable.cpp
new file mode 100644
index 0000000000000..e8e0fbf3eeedd
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/builtin-trivally-copyable.cpp
@@ -0,0 +1,57 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+bool g;
+void store_trivially_copyable_result() {
+  g = __is_trivially_copyable(int);
+}
+
+// CIR: cir.func {{.*}} @_Z31store_trivially_copyable_resultv()
+// CIR:   %[[TRUE:.*]] = cir.const #true
+// CIR:   %[[G_PTR:.*]] = cir.get_global @g : !cir.ptr<!cir.bool>
+// CIR:   cir.store{{.*}} %[[TRUE]], %[[G_PTR:.*]] : !cir.bool, !cir.ptr<!cir.bool>
+
+// LLVM: define{{.*}} void @_Z31store_trivially_copyable_resultv()
+// LLVM:   store i8 1, ptr @g
+
+// OGCG: define{{.*}} void @_Z31store_trivially_copyable_resultv()
+// OGCG:   store i8 1, ptr @g
+
+int test_trivially_copyable_as_bool() {
+  if (!__is_trivially_copyable(int))
+    return -1;
+  return 0;
+}
+
+// CIR: cir.func {{.*}} @_Z31test_trivially_copyable_as_boolv()
+// CIR:   %[[FALSE:.*]] = cir.const #false
+// CIR:   cir.if %[[FALSE]] {
+// CIR:     %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s32i
+// CIR:     cir.store %[[NEG_ONE]], %[[RETVAL:.*]]
+// CIR:     %[[RET:.*]] = cir.load %[[RETVAL:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR:     cir.return %[[RET:.*]] : !s32i
+// CIR:   }
+// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR:   cir.store %[[ZERO]], %[[RETVAL:.*]] : !s32i, !cir.ptr<!s32i>
+// CIR:   %[[RET:.*]] = cir.load %[[RETVAL:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR:   cir.return %[[RET:.*]] : !s32i
+
+// LLVM: define{{.*}} i32 @_Z31test_trivially_copyable_as_boolv()
+// LLVM:   br i1 false, label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+// LLVM: [[IF_THEN]]:
+// LLVM:   store i32 -1, ptr %[[RETVAL:.*]]
+// LLVM:   %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM:   ret i32 %[[RET:.*]]
+// LLVM: [[IF_ELSE]]:
+// LLVM:   br label %[[IF_END:.*]]
+// LLVM: [[IF_END]]:
+// LLVM:   store i32 0, ptr %[[RETVAL:.*]]
+// LLVM:   %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM:   ret i32 %[[RET:.*]]
+
+// OGCG: define{{.*}} i32 @_Z31test_trivially_copyable_as_boolv()
+// OGCG:   ret i32 0
diff --git a/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c b/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
new file mode 100644
index 0000000000000..dcf5fd4246481
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+int g;
+void store_types_compatible_result() {
+  g = __builtin_types_compatible_p(int, const int);
+}
+
+// CIR: cir.func {{.*}} @store_types_compatible_result()
+// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+// CIR:   %[[G_PTR:.*]] = cir.get_global @g : !cir.ptr<!s32i>
+// CIR:   cir.store{{.*}} %[[ONE]], %[[G_PTR:.*]] : !s32i, !cir.ptr<!s32i>
+
+// LLVM: define{{.*}} void @store_types_compatible_result()
+// LLVM:   store i32 1, ptr @g
+
+// OGCG: define{{.*}} void @store_types_compatible_result()
+// OGCG:   store i32 1, ptr @g
+
+int test_convert_bool_to_int() {
+  if (!__builtin_types_compatible_p(int, const int))
+    return -1;
+  return 0;
+}
+
+// CIR: cir.func {{.*}} @test_convert_bool_to_int()
+// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+// CIR:   %[[BOOL:.*]] = cir.cast int_to_bool %[[ONE]] : !s32i -> !cir.bool
+// CIR:   %[[NOT:.*]] = cir.unary(not, %[[BOOL]]) : !cir.bool, !cir.bool
+// CIR:   cir.if %[[NOT]] {
+// CIR:     %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s32i
+// CIR:     cir.store %[[NEG_ONE]], %[[RETVAL:.*]]
+// CIR:     %[[RET:.*]] = cir.load %[[RETVAL:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR:     cir.return %[[RET:.*]] : !s32i
+// CIR:   }
+// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR:   cir.store %[[ZERO]], %[[RETVAL:.*]] : !s32i, !cir.ptr<!s32i>
+// CIR:   %[[RET:.*]] = cir.load %[[RETVAL:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR:   cir.return %[[RET:.*]] : !s32i
+
+// LLVM: define{{.*}} i32 @test_convert_bool_to_int()
+// LLVM:   br i1 false, label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+// LLVM: [[IF_THEN]]:
+// LLVM:   store i32 -1, ptr %[[RETVAL:.*]]
+// LLVM:   %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM:   ret i32 %[[RET:.*]]
+// LLVM: [[IF_ELSE]]:
+// LLVM:   br label %[[IF_END:.*]]
+// LLVM: [[IF_END]]:
+// LLVM:   store i32 0, ptr %[[RETVAL:.*]]
+// LLVM:   %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM:   ret i32 %[[RET:.*]]
+
+// OGCG: define{{.*}} i32 @test_convert_bool_to_int()
+// OGCG:   ret i32 0
diff --git a/clang/test/CodeGen/AArch64/neon/fullfp16.c b/clang/test/CodeGen/AArch64/neon/fullfp16.c
index f3268df2f4165..ab424fc08f176 100644
--- a/clang/test/CodeGen/AArch64/neon/fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fullfp16.c
@@ -50,3 +50,25 @@ float16_t test_vnegh_f16(float16_t a) {
 // LLVM: ret half [[NEG]]
   return vnegh_f16(a);
 }
+
+// ALL-LABEL: test_vfmah_f16
+float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" {{.*}} : (!cir.f16, !cir.f16, !cir.f16) -> !cir.f16
+
+// LLVM-SAME: half{{.*}} [[A:%.*]], half{{.*}} [[B:%.*]], half{{.*}} [[C:%.*]])
+// LLVM:  [[FMA:%.*]] = call half @llvm.fma.f16(half [[B]], half [[C]], half [[A]])
+// LLVM:  ret half [[FMA]]
+  return vfmah_f16(a, b, c);
+}
+
+// ALL-LABEL: test_vfmsh_f16
+float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) {
+// CIR: [[SUB:%.*]] = cir.unary(minus, %{{.*}}) : !cir.f16, !cir.f16
+// CIR: cir.call_llvm_intrinsic "fma" [[SUB]], {{.*}} : (!cir.f16, !cir.f16, !cir.f16) -> !cir.f16
+
+// LLVM-SAME: half{{.*}} [[A:%.*]], half{{.*}} [[B:%.*]], half{{.*}} [[C:%.*]])
+// LLVM:  [[SUB:%.*]] = fneg half [[B]]
+// LLVM:  [[ADD:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[C]], half [[A]])
+// LLVM:  ret half [[ADD]]
+  return vfmsh_f16(a, b, c);
+}
diff --git a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
index 353f02195721f..080e2351ff1e7 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
@@ -619,19 +619,3 @@ float16_t test_vrsqrtsh_f16(float16_t a, float16_t b) {
 float16_t test_vsubh_f16(float16_t a, float16_t b) {
   return vsubh_f16(a, b);
 }
-
-// CHECK-LABEL: test_vfmah_f16
-// CHECK:  [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half %c, half %a)
-// CHECK:  ret half [[FMA]]
-float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) {
-  return vfmah_f16(a, b, c);
-}
-
-// CHECK-LABEL: test_vfmsh_f16
-// CHECK:  [[SUB:%.*]] = fneg half %b
-// CHECK:  [[ADD:%.*]] = call half @llvm.fma.f16(half [[SUB]], half %c, half %a)
-// CHECK:  ret half [[ADD]]
-float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) {
-  return vfmsh_f16(a, b, c);
-}
-
diff --git a/clang/test/CodeGen/attr-no-outline.c b/clang/test/CodeGen/attr-no-outline.c
index 60d2ab5563f34..3e82ca338a121 100644
--- a/clang/test/CodeGen/attr-no-outline.c
+++ b/clang/test/CodeGen/attr-no-outline.c
@@ -1,16 +1,46 @@
-// RUN: %clang_cc1 -emit-llvm -x c %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks | FileCheck %s --check-prefix=C
-// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks | FileCheck %s --check-prefix=CXX
-// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -std=c++23 | FileCheck %s --check-prefixes=CXX,CXX23
+// RUN: %clang_cc1 -emit-llvm -x c %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -DTEST_ATTR  | FileCheck %s --check-prefix=C,C-ATTR
+// RUN: %clang_cc1 -emit-llvm -x c %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -mno-outline | FileCheck %s --check-prefix=C,C-ARG
+// RUN: %clang_cc1 -emit-llvm -x c %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks              | FileCheck %s --check-prefix=C,C-NONE
+
+
+// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -std=c++23 -DTEST_ATTR  | FileCheck %s --check-prefixes=CXX,CXX-ATTR
+// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -std=c++23 -mno-outline | FileCheck %s --check-prefixes=CXX,CXX-ARG
+// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -std=c++23              | FileCheck %s --check-prefixes=CXX,CXX-NONE
+
+// This test checks that:
+// - [[clang::no_outline]] adds the nooutline IR attribute to specific definitions
+// - `-mno-outline` adds the nooutline IR attribute to all definitions
+// - Lack of either does not add nooutline IR attribute
+
+#ifdef TEST_ATTR
+#define ATTR [[clang::no_outline]]
+#define ATTR_DUNDER __attribute__((no_outline))
+#else
+#define ATTR
+#define ATTR_DUNDER
+#endif
 
 // C-LABEL: define dso_local i32 @toplevel_func(
-// C-SAME: ) #[[ATTR0:[0-9]+]] {
+// C-SAME: ) #[[ATTR1:[0-9]+]] {
 
 // CXX-LABEL: define dso_local noundef i32 @_Z13toplevel_funci(
-// CXX-SAME: ) #[[ATTR0:[0-9]+]] {
-[[clang::no_outline]] int toplevel_func(int x) {
+// CXX-SAME: ) #[[ATTR1:[0-9]+]] {
+ATTR int toplevel_func(int x) {
   return x;
 }
 
+// C-LABEL: define dso_local i32 @toplevel_func_noattr(
+// C-ATTR-SAME: ) #[[ATTR2:[0-9]+]] {
+// C-ARG-SAME:  ) #[[ATTR1]] {
+// C-NONE-SAME: ) #[[ATTR1]] {
+
+// CXX-LABEL: define dso_local noundef i32 @_Z20toplevel_func_noattri(
+// CXX-ATTR-SAME: ) #[[ATTR2:[0-9]+]] {
+// CXX-ARG-SAME:  ) #[[ATTR1]] {
+// CXX-NONE-SAME: ) #[[ATTR1]] {
+int toplevel_func_noattr(int x) {
+  return x;
+}
 
 // C-only: Function without prototype
 #ifndef __cplusplus
@@ -19,9 +49,9 @@
 #pragma clang diagnostic ignored "-Wimplicit-int"
 
 // C-LABEL: define dso_local i32 @no_proto_func(
-// C-SAME: ) #[[ATTR0]] {
+// C-SAME: ) #[[ATTR1]] {
 
-[[clang::no_outline]] no_proto_func(x)
+ATTR no_proto_func(x)
 int x; {
   return x;
 }
@@ -32,14 +62,25 @@ int x; {
 // With Blocks
 #if __has_feature(blocks)
 
+// C-LABEL: define dso_local i32 @func_with_block(
+// C-ATTR-SAME: ) #[[ATTR2]] {
+// C-ARG-SAME:  ) #[[ATTR1]] {
+// C-NONE-SAME: ) #[[ATTR1]] {
+
+// CXX-LABEL: define dso_local noundef i32 @_Z15func_with_blocki(
+// CXX-ATTR-SAME: ) #[[ATTR2]] {
+// CXX-ARG-SAME:  ) #[[ATTR1]] {
+// CXX-NONE-SAME: ) #[[ATTR1]] {
 int func_with_block(int x) {
+
 // C-LABEL: define internal i32 @__func_with_block_block_invoke(
-// C-SAME: ) #[[ATTR0]] {
+// C-SAME: ) #[[ATTR1]] {
 
 // CXX-LABEL: define internal noundef i32 @___Z15func_with_blocki_block_invoke(
-// CXX-SAME: ) #[[ATTR1:[0-9]+]] {
-
-  int (^block)(int) = ^ __attribute__((no_outline)) int (int y) { return y; };
+// CXX-ATTR-SAME: ) #[[ATTR3:[0-9]+]] {
+// CXX-ARG-SAME:  ) #[[ATTR2:[0-9]+]] {
+// CXX-NONE-SAME: ) #[[ATTR2:[0-9]+]] {
+  int (^block)(int) = ^ ATTR_DUNDER int (int y) { return y; };
 
   return block(x);
 }
@@ -51,57 +92,74 @@ int func_with_block(int x) {
 struct my_struct {
 
 // CXX-LABEL: define linkonce_odr noundef i32 @_ZN9my_struct11member_funcEi(
-// CXX-SAME: ) #[[ATTR0]] comdat
-  [[clang::no_outline]] int member_func(int x) {
+// CXX-SAME: ) #[[ATTR1]] comdat
+  ATTR int member_func(int x) {
     return x;
   }
 
 // CXX-LABEL: define linkonce_odr noundef i32 @_ZN9my_struct11static_funcEi(
-// CXX-SAME: ) #[[ATTR0]] comdat
-  [[clang::no_outline]] static int static_func(int x) {
+// CXX-SAME: ) #[[ATTR1]] comdat
+  ATTR static int static_func(int x) {
     return x;
   }
 };
 
 template <typename T> struct templated_struct {
-  [[clang::no_outline]] T member_func(T x) {
+  ATTR T member_func(T x) {
     return x;
   }
 
-  [[clang::no_outline]] static T static_func(T x) {
+  ATTR static T static_func(T x) {
     return x;
   }
 };
 
 // CXX-LABEL: define weak_odr noundef i32 @_ZN16templated_structIiE11member_funcEi(
-// CXX-SAME: ) #[[ATTR0]] comdat
+// CXX-SAME: ) #[[ATTR1]] comdat
 // CXX-LABEL: define weak_odr noundef i32 @_ZN16templated_structIiE11static_funcEi(
-// CXX-SAME: ) #[[ATTR0]] comdat
+// CXX-SAME: ) #[[ATTR1]] comdat
 template struct templated_struct<int>;
 
 
-#if __cplusplus >= 202302L
+// CXX-LABEL: define dso_local noundef i32 @_Z16func_with_lambdai(
+// CXX-ATTR-SAME: ) #[[ATTR2]]
+// CXX-ARG-SAME:  ) #[[ATTR1]]
+// CXX-NONE-SAME: ) #[[ATTR1]]
 int func_with_lambda(int x) {
-  // CXX23-LABEL: define internal noundef i32 @"_ZZ16func_with_lambdaiENK3$_0clEv"(
-  // CXX23-SAME: ) #[[ATTR0]]
-  auto lambda = [x][[clang::no_outline]]() -> int {
+
+// CXX-LABEL: define internal noundef i32 @"_ZZ16func_with_lambdaiENK3$_0clEv"(
+// CXX-SAME: ) #[[ATTR1]]
+  auto lambda = [x] ATTR () -> int {
     return x;
   };
 
   return lambda();
 }
 #endif
-#endif
 
 
-// C: attributes #[[ATTR0]] = {
-// C-SAME: nooutline
+// C: attributes #[[ATTR1]] = {
+// C-ATTR-SAME: nooutline
+// C-ARG-SAME: nooutline
+// C-NONE-NOT: nooutline
 // C-SAME: }
 
-// CXX: attributes #[[ATTR0]] = {
-// CXX-SAME: nooutline
-// CXX-SAME: }
+// C-ATTR: attributes #[[ATTR2]] = {
+// C-ATTR-NOT: nooutline
+// C-ATTR-SAME: }
 
 // CXX: attributes #[[ATTR1]] = {
-// CXX-SAME: nooutline
+// CXX-ATTR-SAME: nooutline
+// CXX-ARG-SAME: nooutline
+// CXX-NONE-NOT: nooutline
 // CXX-SAME: }
+
+// CXX: attributes #[[ATTR2]] = {
+// CXX-ATTR-NOT: nooutline
+// CXX-ARG-SAME: nooutline
+// CXX-NONE-NOT: nooutline
+// CXX-SAME: }
+
+// CXX-ATTR: attributes #[[ATTR3]] = {
+// CXX-ATTR-SAME: nooutline
+// CXX-ATTR-SAME: }
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl
new file mode 100644
index 0000000000000..8b1fb9038bedd
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl
@@ -0,0 +1,121 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - -fmatrix-memory-layout=column-major %s | FileCheck %s --check-prefixes=CHECK,COL-CHECK
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - -fmatrix-memory-layout=row-major %s | FileCheck %s --check-prefixes=CHECK,ROW-CHECK
+
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z2fnu11matrix_typeILm2ELm2EfE(
+// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca [2 x <2 x float>], align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <4 x float> [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x float> poison, float [[MATRIXEXT]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// COL-CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+// ROW-CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x float> [[VECINIT]], float [[MATRIXEXT1]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// COL-CHECK-NEXT:    [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+// ROW-CHECK-NEXT:    [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <4 x float> [[VECINIT2]], float [[MATRIXEXT3]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[MATRIXEXT5:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x float> [[VECINIT4]], float [[MATRIXEXT5]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT6]], ptr [[V]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[V]], align 16
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
+float4 fn(float2x2 m) {
+    float4 v = m;
+    return v;
+}
+
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z2fnDv4_i(
+// CHECK-SAME: <4 x i32> noundef [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    [[M:%.*]] = alloca [2 x <2 x i32>], align 4
+// CHECK-NEXT:    store <4 x i32> [[V]], ptr [[V_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[TMP0]], i64 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VECEXT]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[TMP1]], i64 2
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
+// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 1
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT2]], i32 [[VECEXT3]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
+// CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT4]], i32 [[VECEXT5]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT6]], ptr [[M]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[M]], align 4
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+int2x2 fn(int4 v) {
+    int2x2 m = v;
+    return m;
+}
+
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z3fn1Dv2_i(
+// CHECK-SAME: <2 x i32> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <2 x i32> [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VECEXT]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1
+// CHECK-NEXT:    ret <2 x i32> [[VECINIT2]]
+//
+int1x2 fn1(int2 v) {
+    return v;
+}
+
+// CHECK-LABEL: define hidden noundef <3 x i1> @_Z3fn2Dv3_b(
+// CHECK-SAME: <3 x i1> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <3 x i32>, align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = zext <3 x i1> [[B]] to <3 x i32>
+// CHECK-NEXT:    store <3 x i32> [[TMP0]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[LOADEDV:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i1>
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <3 x i1> [[LOADEDV]], i64 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <3 x i1> poison, i1 [[VECEXT]], i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[LOADEDV1:%.*]] = trunc <3 x i32> [[TMP2]] to <3 x i1>
+// CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <3 x i1> [[LOADEDV1]], i64 1
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <3 x i1> [[VECINIT]], i1 [[VECEXT2]], i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[LOADEDV4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i1>
+// CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <3 x i1> [[LOADEDV4]], i64 2
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <3 x i1> [[VECINIT3]], i1 [[VECEXT5]], i32 2
+// CHECK-NEXT:    ret <3 x i1> [[VECINIT6]]
+//
+bool3x1 fn2(bool3 b) {
+    return b;
+}
+
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z3fn3u11matrix_typeILm1ELm3EbE(
+// CHECK-SAME: <3 x i1> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// COL-CHECK-NEXT:    [[B_ADDR:%.*]] = alloca [3 x <1 x i32>], align 4
+// ROW-CHECK-NEXT:    [[B_ADDR:%.*]] = alloca [1 x <3 x i32>], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = zext <3 x i1> [[B]] to <3 x i32>
+// CHECK-NEXT:    store <3 x i32> [[TMP0]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[MATRIXEXT:%.*]] = extractelement <3 x i32> [[TMP1]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <3 x i32> poison, i32 [[MATRIXEXT]], i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[MATRIXEXT1:%.*]] = extractelement <3 x i32> [[TMP2]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <3 x i32> [[VECINIT]], i32 [[MATRIXEXT1]], i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[MATRIXEXT3:%.*]] = extractelement <3 x i32> [[TMP3]], i32 2
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <3 x i32> [[VECINIT2]], i32 [[MATRIXEXT3]], i32 2
+// CHECK-NEXT:    ret <3 x i32> [[VECINIT4]]
+//
+int3 fn3(bool1x3 b) {
+    return b;
+}
diff --git a/clang/test/CodeGenHLSL/matrix_types.hlsl b/clang/test/CodeGenHLSL/matrix_types.hlsl
index 1c2f9cd316543..c502a79d28e23 100644
--- a/clang/test/CodeGenHLSL/matrix_types.hlsl
+++ b/clang/test/CodeGenHLSL/matrix_types.hlsl
@@ -16,31 +16,31 @@
 // CHECK-ROW-MAJOR: @bool1x2_Val = external hidden addrspace(2) global [1 x <2 x i32>], align 4
 // CHECK-ROW-MAJOR: @bool1x3_Val = external hidden addrspace(2) global [1 x <3 x i32>], align 4
 // CHECK-ROW-MAJOR: @bool1x4_Val = external hidden addrspace(2) global [1 x <4 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool2x1_Val = external hidden addrspace(2) global [2 x <1 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool2x2_Val = external hidden addrspace(2) global [2 x <2 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool2x3_Val = external hidden addrspace(2) global [2 x <3 x i32>], align 4
+// CHECK-ROW-MAJOR: @bool2x1_Val = external hidden addrspace(2) global <{ [1 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool2x2_Val = external hidden addrspace(2) global <{ [1 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool2x3_Val = external hidden addrspace(2) global <{ [1 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
 // CHECK-ROW-MAJOR: @bool2x4_Val = external hidden addrspace(2) global [2 x <4 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool3x1_Val = external hidden addrspace(2) global [3 x <1 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool3x2_Val = external hidden addrspace(2) global [3 x <2 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool3x3_Val = external hidden addrspace(2) global [3 x <3 x i32>], align 4
+// CHECK-ROW-MAJOR: @bool3x1_Val = external hidden addrspace(2) global <{ [2 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool3x2_Val = external hidden addrspace(2) global <{ [2 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool3x3_Val = external hidden addrspace(2) global <{ [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
 // CHECK-ROW-MAJOR: @bool3x4_Val = external hidden addrspace(2) global [3 x <4 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool4x1_Val = external hidden addrspace(2) global [4 x <1 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool4x2_Val = external hidden addrspace(2) global [4 x <2 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool4x3_Val = external hidden addrspace(2) global [4 x <3 x i32>], align 4
+// CHECK-ROW-MAJOR: @bool4x1_Val = external hidden addrspace(2) global <{ [3 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool4x2_Val = external hidden addrspace(2) global <{ [3 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool4x3_Val = external hidden addrspace(2) global <{ [3 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
 // CHECK-ROW-MAJOR: @bool4x4_Val = external hidden addrspace(2) global [4 x <4 x i32>], align 4
 
 // CHECK-COL-MAJOR: @bool1x1_Val = external hidden addrspace(2) global [1 x <1 x i32>], align 4
-// CHECK-COL-MAJOR: @bool1x2_Val = external hidden addrspace(2) global [2 x <1 x i32>], align 4
-// CHECK-COL-MAJOR: @bool1x3_Val = external hidden addrspace(2) global [3 x <1 x i32>], align 4
-// CHECK-COL-MAJOR: @bool1x4_Val = external hidden addrspace(2) global [4 x <1 x i32>], align 4
+// CHECK-COL-MAJOR: @bool1x2_Val = external hidden addrspace(2) global <{ [1 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool1x3_Val = external hidden addrspace(2) global <{ [2 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool1x4_Val = external hidden addrspace(2) global <{ [3 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
 // CHECK-COL-MAJOR: @bool2x1_Val = external hidden addrspace(2) global [1 x <2 x i32>], align 4
-// CHECK-COL-MAJOR: @bool2x2_Val = external hidden addrspace(2) global [2 x <2 x i32>], align 4
-// CHECK-COL-MAJOR: @bool2x3_Val = external hidden addrspace(2) global [3 x <2 x i32>], align 4
-// CHECK-COL-MAJOR: @bool2x4_Val = external hidden addrspace(2) global [4 x <2 x i32>], align 4
+// CHECK-COL-MAJOR: @bool2x2_Val = external hidden addrspace(2) global <{ [1 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool2x3_Val = external hidden addrspace(2) global <{ [2 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool2x4_Val = external hidden addrspace(2) global <{ [3 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
 // CHECK-COL-MAJOR: @bool3x1_Val = external hidden addrspace(2) global [1 x <3 x i32>], align 4
-// CHECK-COL-MAJOR: @bool3x2_Val = external hidden addrspace(2) global [2 x <3 x i32>], align 4
-// CHECK-COL-MAJOR: @bool3x3_Val = external hidden addrspace(2) global [3 x <3 x i32>], align 4
-// CHECK-COL-MAJOR: @bool3x4_Val = external hidden addrspace(2) global [4 x <3 x i32>], align 4
+// CHECK-COL-MAJOR: @bool3x2_Val = external hidden addrspace(2) global <{ [1 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool3x3_Val = external hidden addrspace(2) global <{ [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool3x4_Val = external hidden addrspace(2) global <{ [3 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
 // CHECK-COL-MAJOR: @bool4x1_Val = external hidden addrspace(2) global [1 x <4 x i32>], align 4
 // CHECK-COL-MAJOR: @bool4x2_Val = external hidden addrspace(2) global [2 x <4 x i32>], align 4
 // CHECK-COL-MAJOR: @bool4x3_Val = external hidden addrspace(2) global [3 x <4 x i32>], align 4
diff --git a/clang/test/CodeGenHLSL/resources/cbuffer_matrix_align.hlsl b/clang/test/CodeGenHLSL/resources/cbuffer_matrix_align.hlsl
new file mode 100644
index 0000000000000..70b2732372691
--- /dev/null
+++ b/clang/test/CodeGenHLSL/resources/cbuffer_matrix_align.hlsl
@@ -0,0 +1,71 @@
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fsyntax-only -verify -verify-ignore-unexpected=warning
+
+cbuffer MatArr0Pass {
+  float2x4 A0p[2] : packoffset(c0.x);
+  float    a0tail : packoffset(c4.x);
+}
+
+cbuffer MatArr0Fail {
+  float2x4 A0f[2] : packoffset(c0.x);
+  float    a0bad  : packoffset(c3.z);
+  // expected-error@-1 {{packoffset overlap between 'a0bad', 'A0f'}}
+}
+
+// Struct containing a matrix.
+
+struct MS0 {
+  float2x4 M;
+  float2   V;
+};
+
+cbuffer MatStruct0Pass {
+  MS0   s0p   : packoffset(c0.x);
+  float s0tail: packoffset(c2.z);
+}
+
+cbuffer MatStruct0Fail {
+  MS0   s0f   : packoffset(c0.x);
+  float s0bad : packoffset(c2.y);
+  // expected-error@-1 {{packoffset overlap between 's0bad', 's0f'}}
+}
+
+// Nested struct containing a matrix.
+struct Inner0 {
+  float2x4 M;
+  float    F;
+};
+
+struct Outer0 {
+  float2   Head;
+  Inner0   I;
+  float2   Tail;
+};
+
+cbuffer MatNested0Pass {
+  Outer0 o0p   : packoffset(c0.x);
+  float  o0tail: packoffset(c4.x);
+}
+
+cbuffer MatNested0Fail {
+  Outer0 o0f  : packoffset(c0.x);
+  float  o0bad: packoffset(c3.z);
+  // expected-error@-1 {{packoffset overlap between 'o0bad', 'o0f'}}
+}
+
+// Array-of-struct where struct contains a matrix.
+
+struct AMS0 {
+  float2x4 M;
+  float2   V;
+};
+
+cbuffer MatArrStruct0Pass {
+  AMS0  as0p[2] : packoffset(c0.x);
+  float as0tail : packoffset(c5.z);
+}
+
+cbuffer MatArrStruct0Fail {
+  AMS0  as0f[2] : packoffset(c0.x);
+  float as0bad  : packoffset(c5.y);
+  // expected-error@-1 {{packoffset overlap between 'as0bad', 'as0f'}}
+}
diff --git a/clang/test/CodeGenHLSL/resources/default_cbuffer_with_layout.hlsl b/clang/test/CodeGenHLSL/resources/default_cbuffer_with_layout.hlsl
index 7be1f9043042c..63960f817de8f 100644
--- a/clang/test/CodeGenHLSL/resources/default_cbuffer_with_layout.hlsl
+++ b/clang/test/CodeGenHLSL/resources/default_cbuffer_with_layout.hlsl
@@ -9,7 +9,10 @@
 // CHECK-SAME:   %S
 // CHECK-SAME:   i32,
 // CHECK-SAME:   target("dx.Padding", 4),
-// CHECK-SAME:   <4 x float>
+// CHECK-SAME:   <4 x float>,
+// CHECK-SAME:   <{ [2 x <{ <2 x float>, target("dx.Padding", 8) }>], <2 x float> }>,
+// CHECK-SAME:   target("dx.Padding", 8),
+// CHECK-SAME:   [3 x <4 x float>]
 // CHECK-SAME: }>
 
 // CHECK: %S = type <{ <2 x float> }>
@@ -21,6 +24,8 @@
 // CHECK-DAG: @d = external hidden addrspace(2) global <4 x i32>, align 16
 // CHECK-DAG: @e = external hidden addrspace(2) global <4 x float>, align 16
 // CHECK-DAG: @s = external hidden addrspace(2) global %S, align 1
+// CHECK-DAG: @m = external hidden addrspace(2) global <{ [2 x <{ <2 x float>, target("dx.Padding", 8) }>], <2 x float> }>, align 4
+// CHECK-DAG: @n = external hidden addrspace(2) global [3 x <4 x float>], align 4
 
 struct S {
   float2 v;
@@ -32,6 +37,8 @@ int4 d : register(c6);
 double c[4] : register(c2);
 float4 e;
 S s : register(c7);
+float2x3 m;
+float4x3 n;
 
 RWBuffer<float> Buf;
 
@@ -41,4 +48,4 @@ void main() {
 }
 
 // CHECK: !hlsl.cbs = !{![[CB:.*]]}
-// CHECK: ![[CB]] = !{ptr @"$Globals.cb", ptr addrspace(2) @b, ptr addrspace(2) @c, ptr addrspace(2) @d, ptr addrspace(2) @s, ptr addrspace(2) @a, ptr addrspace(2) @e}
+// CHECK: ![[CB]] = !{ptr @"$Globals.cb", ptr addrspace(2) @b, ptr addrspace(2) @c, ptr addrspace(2) @d, ptr addrspace(2) @s, ptr addrspace(2) @a, ptr addrspace(2) @e, ptr addrspace(2) @m, ptr addrspace(2) @n}
diff --git a/clang/test/CodeGenObjC/attr-no-outline.m b/clang/test/CodeGenObjC/attr-no-outline.m
index 16d1a9eb867a0..8819f1d81107c 100644
--- a/clang/test/CodeGenObjC/attr-no-outline.m
+++ b/clang/test/CodeGenObjC/attr-no-outline.m
@@ -1,9 +1,28 @@
-// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s --check-prefix=OBJC
-// RUN: %clang_cc1 -emit-llvm -x objective-c++ %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s --check-prefix=OBJCXX
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - -DTEST_ATTR  | FileCheck %s --check-prefixes=OBJC,OBJC-ATTR
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - -mno-outline | FileCheck %s --check-prefixes=OBJC,OBJC-ARG
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o -              | FileCheck %s --check-prefixes=OBJC,OBJC-NONE
+
+// RUN: %clang_cc1 -emit-llvm -x objective-c++ %s -triple x86_64-unknown-linux-gnu -o - -DTEST_ATTR  | FileCheck %s --check-prefixes=OBJCXX,OBJCXX-ATTR
+// RUN: %clang_cc1 -emit-llvm -x objective-c++ %s -triple x86_64-unknown-linux-gnu -o - -mno-outline | FileCheck %s --check-prefixes=OBJCXX,OBJCXX-ARG
+// RUN: %clang_cc1 -emit-llvm -x objective-c++ %s -triple x86_64-unknown-linux-gnu -o -              | FileCheck %s --check-prefixes=OBJCXX,OBJCXX-NONE
+
+// This test checks that:
+// - [[clang::no_outline]] adds the nooutline IR attribute to specific definitions
+// - `-mno-outline` adds the nooutline IR attribute to all definitions
+// - Lack of either does not add the nooutline IR attribute
+
+
+#ifdef TEST_ATTR
+#define ATTR [[clang::no_outline]]
+#else
+#define ATTR
+#endif
 
 @interface Test
 - (int)method:(int)x;
+- (int)method_no_attr:(int)x;
 + (int)static_method:(int)x;
++ (int)static_method_no_attr:(int)x;
 @end
 
 @implementation Test
@@ -13,7 +32,20 @@ @implementation Test
 
 // OBJCXX-LABEL: define internal noundef i32 @"\01-[Test method:]"(
 // OBJCXX: ) #[[ATTR0:[0-9]+]] {
-- (int)method:(int)x [[clang::no_outline]] {
+- (int)method:(int)x ATTR {
+  return x;
+}
+
+// OBJC-LABEL: define internal i32 @"\01-[Test method_no_attr:]"(
+// OBJC-ATTR: ) #[[ATTR1:[0-9]+]] {
+// OBJC-ARG:  ) #[[ATTR0]] {
+// OBJC-NONE: ) #[[ATTR0]] {
+
+// OBJCXX-LABEL: define internal noundef i32 @"\01-[Test method_no_attr:]"(
+// OBJCXX-ATTR: ) #[[ATTR1:[0-9]+]] {
+// OBJCXX-ARG:  ) #[[ATTR0]] {
+// OBJCXX-NONE: ) #[[ATTR0]] {
+- (int)method_no_attr:(int) x {
   return x;
 }
 
@@ -22,19 +54,44 @@ - (int)method:(int)x [[clang::no_outline]] {
 
 // OBJCXX-LABEL: define internal noundef i32 @"\01+[Test static_method:]"(
 // OBJCXX: ) #[[ATTR0]] {
-+ (int)static_method:(int)x [[clang::no_outline]] {
++ (int)static_method:(int)x ATTR {
+  return x;
+}
+
+
+// OBJC-LABEL: define internal i32 @"\01+[Test static_method_no_attr:]"(
+// OBJC-ATTR: ) #[[ATTR1]] {
+// OBJC-ARG:  ) #[[ATTR0]] {
+// OBJC-NONE: ) #[[ATTR0]] {
+
+
+// OBJCXX-LABEL: define internal noundef i32 @"\01+[Test static_method_no_attr:]"(
+// OBJCXX-ATTR: ) #[[ATTR1]] {
+// OBJCXX-ARG:  ) #[[ATTR0]] {
+// OBJCXX-NONE: ) #[[ATTR0]] {
+
++ (int)static_method_no_attr:(int)x {
   return x;
 }
 
 @end
 
 // OBJC: attributes #[[ATTR0]] = {
-// OBJC-SAME: nooutline
+// OBJC-ATTR-SAME: nooutline
+// OBJC-ARG-SAME: nooutline
+// OBJC-NONE-NOT: nooutline
 // OBJC-SAME: }
 
+// OBJC-ATTR: attributes #[[ATTR1]] = {
+// OBJC-ATTR-NOT: nooutline
+// OBJC-ATTR-SAME: }
+
 // OBJCXX: attributes #[[ATTR0]] = {
-// OBJCXX-SAME: nooutline
+// OBJCXX-ATTR-SAME: nooutline
+// OBJCXX-ARG-SAME: nooutline
+// OBJCXX-NONE-NOT: nooutline
 // OBJCXX-SAME: }
 
-
-
+// OBJCXX-ATTR: attributes #[[ATTR1]] = {
+// OBJCXX-ATTR-NOT: nooutline
+// OBJCXX-ATTR-SAME: }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
index 6326866ed3c35..47ae7ce82becf 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
@@ -1,6 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1170 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s
 
 typedef int    v2i   __attribute__((ext_vector_type(2)));
 typedef float  v8f   __attribute__((ext_vector_type(8)));
@@ -14,12 +15,12 @@ typedef int    v8i   __attribute__((ext_vector_type(8)));
 // amdgcn_wmma_f32_16x16x16_f16
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]])
+// CHECK-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
 {
@@ -30,12 +31,12 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
 // amdgcn_wmma_f32_16x16x16_bf16
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]])
+// CHECK-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
 {
@@ -46,12 +47,12 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c
 // amdgcn_wmma_f16_16x16x16_f16
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false)
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
 {
@@ -62,12 +63,12 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
 // amdgcn_wmma_bf16_16x16x16_bf16
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false)
+// CHECK-NEXT:    store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
 {
@@ -78,12 +79,12 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s
 // amdgcn_wmma_i32_16x16x16_iu8
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
+// CHECK-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
 {
@@ -94,79 +95,79 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
 // amdgcn_wmma_i32_16x16x16_iu4
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false)
+// CHECK-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
 {
   *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
+// CHECK-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
 {
   *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
+// CHECK-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
 {
   *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
+// CHECK-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
 {
   *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
+// CHECK-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
 {
   *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
+// CHECK-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
 {
   *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
 }
 //.
-// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
-// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"}
-// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
+// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
+// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
 //.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
index a79c3d4da1ebb..98ce84adf1554 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
@@ -1,6 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1170 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s
 
 typedef float  v4f   __attribute__((ext_vector_type(4)));
 typedef half   v4h   __attribute__((ext_vector_type(4)));
@@ -13,12 +14,12 @@ typedef int    v4i   __attribute__((ext_vector_type(4)));
 // amdgcn_wmma_f32_16x16x16_f16
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]])
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
 {
@@ -29,12 +30,12 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
 // amdgcn_wmma_f32_16x16x16_bf16
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]])
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
 {
@@ -45,12 +46,12 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c
 // amdgcn_wmma_f16_16x16x16_f16
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false)
+// CHECK-NEXT:    store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
 {
@@ -61,12 +62,12 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
 // amdgcn_wmma_bf16_16x16x16_bf16
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false)
+// CHECK-NEXT:    store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
 {
@@ -77,12 +78,12 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s
 // amdgcn_wmma_i32_16x16x16_iu8
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
+// CHECK-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
 {
@@ -93,79 +94,79 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
 // amdgcn_wmma_i32_16x16x16_iu4
 //
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
+// CHECK-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
 {
   *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c)
 {
   *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c)
 {
   *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c)
 {
   *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c)
 {
   *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
 }
 
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT:    ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
+// CHECK-NEXT:    store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT:    ret void
 //
 void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c)
 {
   *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false);
 }
 //.
-// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
-// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"}
-// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
+// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
+// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
 //.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
index a1a56f0d8417d..ed72a8ee7dbd2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
@@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b
                                             global v16s* out16s, v2i a2i, v2i b2i, v16s c16s,
                                             global v8i* out8i, v4i a4i, v4i b4i, v8i c8i)
 {
- *out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f);  // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f);  // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature gfx11-insts,wavefrontsize32}}
+ *out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f);  // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f);  // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
 }
 
 #endif
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl
index d995b1dc46be7..4b1808fe6d6e6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl
@@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out4f, v16h a16h, v16h b
                                             global v8s* out8s, v4i a4i, v4i b4i, v8s c8s,
                                             global v4i* out4i, v2i a2i, v2i b2i, v4i c4i)
 {
- *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f);  // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f);  // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature gfx11-insts,wavefrontsize64}}
+ *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f);  // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f);  // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
 }
 
 #endif
diff --git a/clang/test/Driver/Inputs/XRSimulator1.0.sdk/usr/include/libxml/.keep b/clang/test/Driver/Inputs/XRSimulator1.0.sdk/usr/include/libxml/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/aarch64-outliner.c b/clang/test/Driver/aarch64-outliner.c
index 5ed822f122fc4..4d5b7321e330f 100644
--- a/clang/test/Driver/aarch64-outliner.c
+++ b/clang/test/Driver/aarch64-outliner.c
@@ -3,4 +3,4 @@
 // ON: "-mllvm" "-enable-machine-outliner"
 // RUN: %clang --target=aarch64 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
 // RUN: %clang --target=aarch64_be -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
-// OFF: "-mllvm" "-enable-machine-outliner=never"
+// OFF: "-mno-outline" "-mllvm" "-enable-machine-outliner=never"
diff --git a/clang/test/Driver/arm-machine-outliner.c b/clang/test/Driver/arm-machine-outliner.c
index a1e705cb60a1b..efa29d2ab8450 100644
--- a/clang/test/Driver/arm-machine-outliner.c
+++ b/clang/test/Driver/arm-machine-outliner.c
@@ -3,6 +3,6 @@
 // RUN: %clang -target armv7-linux-gnueabihf -flto -moutline %s -### 2>&1 | FileCheck %s -check-prefix=ON-LTO
 // ON-LTO: "-plugin-opt=-enable-machine-outliner"
 // RUN: %clang -target armv7-linux-gnueabihf -moutline -mno-outline -c %s -### 2>&1 | FileCheck %s -check-prefix=OFF
-// OFF: "-mllvm" "-enable-machine-outliner=never"
+// OFF: "-mno-outline" "-mllvm" "-enable-machine-outliner=never"
 // RUN: %clang -target armv7-linux-gnueabihf -flto -moutline -mno-outline %s -### 2>&1 | FileCheck %s -check-prefix=OFF-LTO
 // OFF-LTO: "-plugin-opt=-enable-machine-outliner=never"
diff --git a/clang/test/Driver/crash-diagnostics-dir-3.c b/clang/test/Driver/crash-diagnostics-dir-3.c
index a91bc48d7e462..63a5efc853a4c 100644
--- a/clang/test/Driver/crash-diagnostics-dir-3.c
+++ b/clang/test/Driver/crash-diagnostics-dir-3.c
@@ -1,6 +1,6 @@
 // RUN: export LSAN_OPTIONS=detect_leaks=0
 // RUN: rm -rf %t
-// RUN: not env CLANG_CRASH_DIAGNOSTICS_DIR=%t %clang -c %s -o - 2>&1 | FileCheck %s
+// RUN: not %crash_opt env CLANG_CRASH_DIAGNOSTICS_DIR=%t %clang -c %s -o - 2>&1 | FileCheck %s
 #pragma clang __debug parser_crash
 // CHECK: Preprocessed source(s) and associated run script(s) are located at:
 // CHECK: diagnostic msg: {{.*}}{{/|\\}}crash-diagnostics-dir-3.c.tmp{{(/|\\).*}}.c
diff --git a/clang/test/Driver/crash-diagnostics-dir.c b/clang/test/Driver/crash-diagnostics-dir.c
index 16382eff1cde7..9a8299bffe005 100644
--- a/clang/test/Driver/crash-diagnostics-dir.c
+++ b/clang/test/Driver/crash-diagnostics-dir.c
@@ -1,6 +1,6 @@
 // RUN: export LSAN_OPTIONS=detect_leaks=0
 // RUN: rm -rf %t
-// RUN: not %clang -fcrash-diagnostics-dir=%t -c %s -o - 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -fcrash-diagnostics-dir=%t -c %s -o - 2>&1 | FileCheck %s
 #pragma clang __debug parser_crash
 // CHECK: Preprocessed source(s) and associated run script(s) are located at:
 // CHECK: diagnostic msg: {{.*}}{{/|\\}}crash-diagnostics-dir.c.tmp{{(/|\\).*}}.c
diff --git a/clang/test/Driver/crash-ir-repro.cpp b/clang/test/Driver/crash-ir-repro.cpp
index 217d5ed421bdb..1a2000ad1279f 100644
--- a/clang/test/Driver/crash-ir-repro.cpp
+++ b/clang/test/Driver/crash-ir-repro.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang -S -emit-llvm -o %t.ll %s
-// RUN: not %clang -S -DCRASH %s -o %t.ll 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -S -DCRASH %s -o %t.ll 2>&1 | FileCheck %s
 
 // TODO(boomanaiden154): This test case causes clang to raise a signal when
 // running under ubsan, but not in normal build configurations. This should
diff --git a/clang/test/Driver/crash-report-clang-cl.cpp b/clang/test/Driver/crash-report-clang-cl.cpp
index 963c3b6d0ab03..f61b94626f584 100644
--- a/clang/test/Driver/crash-report-clang-cl.cpp
+++ b/clang/test/Driver/crash-report-clang-cl.cpp
@@ -2,7 +2,7 @@
 // RUN: rm -rf %t
 // RUN: mkdir %t
 
-// RUN: not %clang_cl -fsyntax-only /Brepro /source-charset:utf-8 \
+// RUN: not %crash_opt %clang_cl -fsyntax-only /Brepro /source-charset:utf-8 \
 // RUN:     -fcrash-diagnostics-dir=%t -- %s 2>&1 | FileCheck %s
 // RUN: cat %t/crash-report-clang-cl-*.cpp | FileCheck --check-prefix=CHECKSRC %s
 // RUN: cat %t/crash-report-clang-cl-*.sh | FileCheck --check-prefix=CHECKSH %s
diff --git a/clang/test/Driver/crash-report-header.h b/clang/test/Driver/crash-report-header.h
index 04865a0cc300f..6d5156537126d 100644
--- a/clang/test/Driver/crash-report-header.h
+++ b/clang/test/Driver/crash-report-header.h
@@ -1,7 +1,7 @@
 // RUN: export LSAN_OPTIONS=detect_leaks=0
 // RUN: rm -rf %t
 // RUN: mkdir %t
-// RUN: env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 not %clang -fsyntax-only %s 2>&1 | FileCheck %s
+// RUN: env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 not %crash_opt %clang -fsyntax-only %s 2>&1 | FileCheck %s
 // RUN: cat %t/crash-report-header-*.h | FileCheck --check-prefix=CHECKSRC "%s"
 // RUN: cat %t/crash-report-header-*.sh | FileCheck --check-prefix=CHECKSH "%s"
 // REQUIRES: crash-recovery
diff --git a/clang/test/Driver/crash-report-spaces.c b/clang/test/Driver/crash-report-spaces.c
index b4d8ac1f57e83..b5fbb59683fc0 100644
--- a/clang/test/Driver/crash-report-spaces.c
+++ b/clang/test/Driver/crash-report-spaces.c
@@ -2,7 +2,7 @@
 // RUN: rm -rf "%t"
 // RUN: mkdir "%t"
 // RUN: cp "%s" "%t/crash report spaces.c"
-// RUN: env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 not %clang -fsyntax-only "%t/crash report spaces.c" 2>&1 | FileCheck "%s"
+// RUN: env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 not %crash_opt %clang -fsyntax-only "%t/crash report spaces.c" 2>&1 | FileCheck "%s"
 // RUN: cat "%t/crash report spaces"-*.c | FileCheck --check-prefix=CHECKSRC "%s"
 // RUN: cat "%t/crash report spaces"-*.sh | FileCheck --check-prefix=CHECKSH "%s"
 // REQUIRES: crash-recovery
diff --git a/clang/test/Driver/crash-report-with-asserts.c b/clang/test/Driver/crash-report-with-asserts.c
index 686c49f339fb7..278860a9158e4 100644
--- a/clang/test/Driver/crash-report-with-asserts.c
+++ b/clang/test/Driver/crash-report-with-asserts.c
@@ -12,13 +12,13 @@
 
 // RUN: env TMPDIR=%t TEMP=%t TMP=%t RC_DEBUG_OPTIONS=1                  \
 // RUN:  CC_PRINT_HEADERS=1 CC_LOG_DIAGNOSTICS=1                         \
-// RUN:  not %clang %s @%t.rsp -DASSERT 2>&1 | FileCheck %s
+// RUN:  not %crash_opt %clang %s @%t.rsp -DASSERT 2>&1 | FileCheck %s
 // RUN: cat %t/crash-report-*.c | FileCheck --check-prefix=CHECKSRC %s
 // RUN: cat %t/crash-report-*.sh | FileCheck --check-prefix=CHECKSH %s
 
 // RUN: env TMPDIR=%t TEMP=%t TMP=%t RC_DEBUG_OPTIONS=1                  \
 // RUN:  CC_PRINT_HEADERS=1 CC_LOG_DIAGNOSTICS=1                         \
-// RUN:  not %clang %s @%t.rsp -DUNREACHABLE 2>&1 | FileCheck %s
+// RUN:  not %crash_opt %clang %s @%t.rsp -DUNREACHABLE 2>&1 | FileCheck %s
 // RUN: cat %t/crash-report-with-asserts-*.c | FileCheck --check-prefix=CHECKSRC %s
 // RUN: cat %t/crash-report-with-asserts-*.sh | FileCheck --check-prefix=CHECKSH %s
 
diff --git a/clang/test/Driver/crash-report.cpp b/clang/test/Driver/crash-report.cpp
index 59eee65af57ee..c431940bf9ea1 100644
--- a/clang/test/Driver/crash-report.cpp
+++ b/clang/test/Driver/crash-report.cpp
@@ -12,13 +12,13 @@
 
 // RUN: env TMPDIR=%t TEMP=%t TMP=%t RC_DEBUG_OPTIONS=1                  \
 // RUN:  CC_PRINT_HEADERS=1 CC_LOG_DIAGNOSTICS=1                         \
-// RUN:  not %clang %s @%t.rsp -DPARSER 2>&1 | FileCheck %s
+// RUN:  not %crash_opt %clang %s @%t.rsp -DPARSER 2>&1 | FileCheck %s
 // RUN: cat %t/crash-report-*.cpp | FileCheck --check-prefix=CHECKSRC %s
 // RUN: cat %t/crash-report-*.sh | FileCheck --check-prefix=CHECKSH %s
 
 // RUN: env TMPDIR=%t TEMP=%t TMP=%t RC_DEBUG_OPTIONS=1                  \
 // RUN:  CC_PRINT_HEADERS=1 CC_LOG_DIAGNOSTICS=1                         \
-// RUN:  not %clang %s @%t.rsp -DCRASH 2>&1 | FileCheck %s
+// RUN:  not %crash_opt %clang %s @%t.rsp -DCRASH 2>&1 | FileCheck %s
 // RUN: cat %t/crash-report-*.cpp | FileCheck --check-prefix=CHECKSRC %s
 // RUN: cat %t/crash-report-*.sh | FileCheck --check-prefix=CHECKSH %s
 
diff --git a/clang/test/Driver/emit-reproducer.c b/clang/test/Driver/emit-reproducer.c
index 18e1b4e41b91d..6fd1735ee8549 100644
--- a/clang/test/Driver/emit-reproducer.c
+++ b/clang/test/Driver/emit-reproducer.c
@@ -3,13 +3,13 @@
 
 // RUN: echo "%s -fcrash-diagnostics-dir=%t -fsyntax-only" | sed -e 's/\\/\\\\/g' > %t.rsp
 
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer=off    2>&1 | FileCheck %s --check-prefix=NOT
-// RUN: not %clang -DFATAL @%t.rsp -fno-crash-diagnostics 2>&1 | FileCheck %s --check-prefix=NOT
-// RUN: not %clang -DFATAL @%t.rsp                        2>&1 | FileCheck %s
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer=crash  2>&1 | FileCheck %s
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer=error  2>&1 | FileCheck %s
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer=always 2>&1 | FileCheck %s
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer        2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer=off    2>&1 | FileCheck %s --check-prefix=NOT
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -fno-crash-diagnostics 2>&1 | FileCheck %s --check-prefix=NOT
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp                        2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer=crash  2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer=error  2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer=always 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer        2>&1 | FileCheck %s
 
 // RUN: not %clang -DERROR @%t.rsp -gen-reproducer=off    2>&1 | FileCheck %s --check-prefix=NOT
 // RUN: not %clang -DERROR @%t.rsp -fno-crash-diagnostics 2>&1 | FileCheck %s --check-prefix=NOT
diff --git a/clang/test/Driver/incompatible_sysroot.c b/clang/test/Driver/incompatible_sysroot.c
index a5f7d03da7254..6bc8cd07d1f12 100644
--- a/clang/test/Driver/incompatible_sysroot.c
+++ b/clang/test/Driver/incompatible_sysroot.c
@@ -12,13 +12,14 @@
 // RUN: %clang -target arm64-apple-visionos1.0-simulator -Wincompatible-sysroot -isysroot %S/Inputs/XRSimulator1.0.sdk -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-VISIONOSSIM %s
 // RUN: %clang -target arm64-apple-xros1.0 -Wincompatible-sysroot -isysroot %S/Inputs/XRSimulator1.0.sdk -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-VISIONOSSIM-VISIONOS %s
 // RUN: %clang -target arm64-apple-ios17.1 -Wincompatible-sysroot -isysroot %S/Inputs/XRSimulator1.0.sdk -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-VISIONOSSIM-IOS %s
+// RUN: %clang -target arm64-apple-visionos1.0-simulator -Wincompatible-sysroot -isysroot %S/Inputs/XRSimulator1.0.sdk/usr/include/libxml -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-VISIONOSSIM %s
 
 int main() { return 0; }
-// CHECK-OSX-IOS: warning: using sysroot for 'MacOSX' but targeting 'x86_64-apple-ios9.0.0-simulator'
-// CHECK-IOS-WATCHOS: warning: using sysroot for 'iPhoneOS' but targeting 'arm64-apple-watchos2.0.0'
-// CHECK-IOS-TVOS: warning: using sysroot for 'iPhoneOS' but targeting 'arm64-apple-tvos9.0.0'
-// CHECK-OSX-DRIVERKIT: warning: using sysroot for 'MacOSX' but targeting 'x86_64-apple-driverkit19.0.0'
-// CHECK-IOS-DRIVERKIT: warning: using sysroot for 'iPhoneOS' but targeting 'x86_64-apple-driverkit19.0.0'
+// CHECK-OSX-IOS: warning: using sysroot for 'MacOSX10.9' but targeting 'x86_64-apple-ios9.0.0-simulator'
+// CHECK-IOS-WATCHOS: warning: using sysroot for 'iPhoneOS9.2' but targeting 'arm64-apple-watchos2.0.0'
+// CHECK-IOS-TVOS: warning: using sysroot for 'iPhoneOS9.2' but targeting 'arm64-apple-tvos9.0.0'
+// CHECK-OSX-DRIVERKIT: warning: using sysroot for 'MacOSX10.9' but targeting 'x86_64-apple-driverkit19.0.0'
+// CHECK-IOS-DRIVERKIT: warning: using sysroot for 'iPhoneOS9.2' but targeting 'x86_64-apple-driverkit19.0.0'
 // CHECK-IOS-IOSSIM-NOT: warning: using sysroot for '{{.*}}' but targeting '{{.*}}'
 // CHECK-OSX-IOS-DISABLED-NOT: warning: using sysroot for '{{.*}}' but targeting '{{.*}}'
 
diff --git a/clang/test/Driver/lit.local.cfg b/clang/test/Driver/lit.local.cfg
index 6370e9f92d89b..a47d0de90d763 100644
--- a/clang/test/Driver/lit.local.cfg
+++ b/clang/test/Driver/lit.local.cfg
@@ -1,4 +1,5 @@
 from lit.llvm import llvm_config
+import sys
 
 config.suffixes = [
     ".c",
@@ -27,6 +28,12 @@ config.substitutions.insert(
     0, ("%clang_cc1", """*** Do not use 'clang -cc1' in Driver tests. ***""")
 )
 
+is_windows = sys.platform.startswith("win")
+if is_windows:
+    config.substitutions.append(('%crash_opt', ''))
+else:
+    config.substitutions.append(('%crash_opt', '--crash'))
+
 # Remove harmful environmental variables for clang Driver tests.
 # Some might be useful for other tests so they are only removed here.
 driver_overwrite_env_vars = [
diff --git a/clang/test/Driver/output-file-cleanup.c b/clang/test/Driver/output-file-cleanup.c
index 3628df8192652..432ff640656e7 100644
--- a/clang/test/Driver/output-file-cleanup.c
+++ b/clang/test/Driver/output-file-cleanup.c
@@ -2,7 +2,7 @@
 // RUN: rm -f "%t.d" "%t1.s" "%t2.s" "%t3.s" "%t4.s" "%t5.s"
 //
 // RUN: touch %t.s
-// RUN: not %clang -S -DCRASH -o %t.s -MMD -MF %t.d %s
+// RUN: not %crash_opt %clang -S -DCRASH -o %t.s -MMD -MF %t.d %s
 // RUN: test ! -f %t.s
 // RUN: test ! -f %t.d
 
diff --git a/clang/test/Driver/riscv-outliner.c b/clang/test/Driver/riscv-outliner.c
index 9e9905ab4fd8a..fa69977331e13 100644
--- a/clang/test/Driver/riscv-outliner.c
+++ b/clang/test/Driver/riscv-outliner.c
@@ -4,4 +4,4 @@
 
 // RUN: %clang --target=riscv32 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
 // RUN: %clang --target=riscv64 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
-// OFF: "-mllvm" "-enable-machine-outliner=never"
+// OFF: "-mno-outline" "-mllvm" "-enable-machine-outliner=never"
diff --git a/clang/test/Driver/x86-outliner.c b/clang/test/Driver/x86-outliner.c
index e2af85d3d16ab..7da56ac93fa5e 100644
--- a/clang/test/Driver/x86-outliner.c
+++ b/clang/test/Driver/x86-outliner.c
@@ -4,4 +4,4 @@
 
 // RUN: %clang --target=i386 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
 // RUN: %clang --target=x86_64 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
-// OFF: "-mllvm" "-enable-machine-outliner=never"
+// OFF: "-mno-outline" "-mllvm" "-enable-machine-outliner=never"
diff --git a/clang/test/Modules/demote-var-def.cpp b/clang/test/Modules/demote-var-def.cpp
deleted file mode 100644
index 811440dd736f2..0000000000000
--- a/clang/test/Modules/demote-var-def.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-// RUN: cd %t
-//
-// DEFINE: %{common-flags}= -I %t -isystem %t -xc++ -std=c++20 -fmodules
-//
-// RUN: mkdir -p %t/b2
-// RUN: mkdir -p %t/b1
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_d \
-// RUN:     d.cppmap -o d.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_a \
-// RUN:     -fmodule-file=d.pcm  a.cppmap -o a.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_b2 \
-// RUN:     -fmodule-file=a.pcm b2/b.cppmap -o b2/b.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_b1 \
-// RUN:     -fmodule-file=b2/b.pcm b1/b.cppmap -o b1/b.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_f \
-// RUN:     -fmodule-file=b1/b.pcm f.cppmap -o f.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_c \
-// RUN:     -fmodule-file=f.pcm c.cppmap -o c.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module \
-// RUN:     -fmodule-name=module_e e.cppmap -o e.pcm
-//
-// RUN: %clang_cc1 %{common-flags} \
-// RUN:     -fmodule-file=c.pcm -fmodule-file=e.pcm \
-// RUN:     src.cpp -o src.pic.o
-
-//--- invoke.h
-#ifndef _LIBCPP___TYPE_TRAITS_IS_SAME_H
-#define _LIBCPP___TYPE_TRAITS_IS_SAME_H
-namespace std { inline namespace _LIBCPP_ABI_NAMESPACE {
-template <class _Tp, class _Up>
-constexpr bool is_same_v = __is_same(_Tp, _Up);
-} }
-#endif
-
-//--- memory
-#include <invoke.h>
-namespace std { inline namespace _LIBCPP_ABI_NAMESPACE {
-template <class _Tp>
-using __decay_t = __decay(_Tp);
-template <class _Tp>
-using decay_t = __decay_t<_Tp>;
-} }
-
-//--- other.h
-#include <invoke.h>
-
-//--- a.cppmap
-module "module_a" {
-}
-
-//--- b1/b.cppmap
-module "module_b1" {
-}
-
-//--- b2/b.cppmap
-module "module_b2" {
-}
-
-//--- c.cppmap
-module "module_c" {
-}
-
-//--- d.cppmap
-module "module_d" {
-    header "d.h"
-}
-
-//--- d.h
-#include <other.h>
-
-//--- e.cppmap
-module "module_e" {
-    header "e.h"
-}
-
-//--- e.h
-#include <memory>
-
-//--- f.cppmap
-module "module_f" {
-}
-
-//--- src.cpp
-#include <d.h>
-#include <memory>
-template <typename T>
-concept coroutine_result =
-    std::is_same_v<std::decay_t<T>, T>;
-template <coroutine_result R>
-class Co;
-using T = Co<void>;
diff --git a/clang/test/Modules/pr149404-02.cppm b/clang/test/Modules/pr149404-02.cppm
deleted file mode 100644
index 291619ea05b8a..0000000000000
--- a/clang/test/Modules/pr149404-02.cppm
+++ /dev/null
@@ -1,104 +0,0 @@
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -o %t/format.pcm %t/format.cppm
-// RUN: %clang_cc1 -std=c++20  -emit-module-interface -o %t/includes_in_gmf.pcm %t/includes_in_gmf.cppm
-// RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t %t/test.cpp -verify -fsyntax-only
-
-// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface -o %t/format.pcm %t/format.cppm
-// RUN: %clang_cc1 -std=c++20  -emit-reduced-module-interface -o %t/includes_in_gmf.pcm %t/includes_in_gmf.cppm
-// RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t %t/test.cpp -verify -fsyntax-only
-
-//--- format.h
-#pragma once
-
-namespace test {
-
-template <class _Tp>
-struct type_identity {
-    typedef _Tp type;
-};
-
-template <class _Tp>
-using type_identity_t = typename type_identity<_Tp>::type;
-
-
-template <class _Tp, class _CharT>
-struct formatter
-{
-    formatter() = delete;
-};
-
-template <>
-struct formatter<char, char>
-{};
-
-template <class _CharT, class... _Args>
-struct basic_format_string {
-    static inline const int __handles_{ [] {
-        formatter<char, _CharT> f;
-        (void)f;
-        return 0;
-        }() };
-    
-    consteval basic_format_string(const _CharT*) {
-        (void)__handles_;
-    }
-};
-
-template <class... _Args>
-using wformat_string = basic_format_string<wchar_t, type_identity_t<_Args>...>;
-
-template <class... _Args>
-using format_string = basic_format_string<char, type_identity_t<_Args>...>;
-
-template <class... _Args>
-void format(format_string<_Args...> __fmt, _Args&&... __args) {}
-
-template <class... _Args>
-void format(wformat_string<_Args...> __fmt, _Args&&... __args) {}
-
-}
-
-//--- format.cppm
-module;
-#include "format.h"
-export module format;
-
-export namespace test {
-	using test::format;
-	using test::formatter;
-	using test::format_string;
-}
-
-auto something() -> void
-{
-	auto a = 'a';
-	test::format("{}", a);
-}
-
-//--- includes_in_gmf.cppm
-module;
-#include "format.h"
-export module includes_in_gmf;
-
-namespace test {
-	using test::format;
-	using test::formatter;
-	using test::format_string;
-}
-
-//--- test.cpp
-// expected-no-diagnostics
-import format;
-import includes_in_gmf;
-
-auto what() -> void
-{
-    auto a = 'a';
-    test::format("{}", a);
-
-    constexpr auto fs = "{}"; // test::format_string<char>{ "{}" }; // <- same result even passing exact param type
-    test::format(fs, 'r');
-}
diff --git a/clang/test/Modules/pr172241.cppm b/clang/test/Modules/pr172241.cppm
deleted file mode 100644
index 3eb885e8b2d9f..0000000000000
--- a/clang/test/Modules/pr172241.cppm
+++ /dev/null
@@ -1,47 +0,0 @@
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-//
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/m.cppm -emit-module-interface -o %t/m.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/use.cpp -fmodule-file=m=%t/m.pcm -emit-llvm -o - | FileCheck %t/use.cpp
-//
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/m.cppm -emit-reduced-module-interface -o %t/m.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/use.cpp -fmodule-file=m=%t/m.pcm -emit-llvm -o - | FileCheck %t/use.cpp
-
-//--- header.h
-#pragma once
-
-template <unsigned T>
-class Templ {
-public:
-    void lock() { __set_locked_bit(); }
-
-private:
-    static constexpr auto __set_locked_bit = [](){};
-};
-
-class JT {
-public:
-    ~JT() {
-        Templ<4> state;
-        state.lock();
-    }
-};
-
-//--- m.cppm
-module;
-#include "header.h"
-export module m;
-export struct M {
-    JT jt;
-};
-//--- use.cpp
-#include "header.h"
-import m;
-
-int main() {
-    M m;
-    return 0;
-}
-
-// CHECK: @_ZN5TemplILj4EE16__set_locked_bitE = {{.*}}linkonce_odr
diff --git a/clang/test/Modules/var-inst-def.cppm b/clang/test/Modules/var-inst-def.cppm
deleted file mode 100644
index 1414ec76c7be5..0000000000000
--- a/clang/test/Modules/var-inst-def.cppm
+++ /dev/null
@@ -1,110 +0,0 @@
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-// RUN: cd %t
-//
-// RUN: %clang_cc1 -fmodule-name=A -xc++ -emit-module -fmodules \
-// RUN:   -fno-cxx-modules -fno-implicit-modules \
-// RUN:   -fmodule-map-file-home-is-cwd -std=c++20 -I. a.modulemap -o a.pcm
-//
-// RUN: %clang_cc1 -fmodule-name=B -xc++ -emit-module -fmodules \
-// RUN:   -fno-cxx-modules -fno-implicit-modules \
-// RUN:   -fmodule-map-file-home-is-cwd -std=c++20 -I. b.modulemap -o b.pcm
-//
-// RUN: %clang_cc1 -fmodule-name=C -xc++ -emit-module -fmodules \
-// RUN:   -fno-cxx-modules -fno-implicit-modules \
-// RUN:   -fmodule-map-file-home-is-cwd -std=c++20 -I. c.modulemap -o c.pcm
-//
-// RUN: %clang_cc1 -fno-cxx-modules -fmodules -fno-implicit-modules \
-// RUN:   -fmodule-map-file-home-is-cwd \
-// RUN:   -fmodule-file=a.pcm -fmodule-file=b.pcm -fmodule-file=c.pcm \
-// RUN:   -std=c++20 -I. main.cpp -o /dev/null
-
-//--- a.modulemap
-module "A" { header "a.h" }
-//--- b.modulemap
-module "B" { header "b.h" }
-//--- c.modulemap
-module "C" { header "c.h" }
-
-//--- common.h
-#pragma once
-#include "stl.h"
-
-//--- a.h
-#pragma once
-#include "common.h"
-#include "repro.h"
-
-//--- b.h
-#pragma once
-#include "common.h"
-#include "repro.h"
-
-//--- c.h
-#pragma once
-#include "common.h"
-#include "repro.h"
-
-//--- repro.h
-#pragma once
-#include "stl.h"
-
-namespace k {
-template <template <typename> class , typename >
-struct is_instantiation : std::integral_constant<bool, false> {};
-template <template <typename> class C, typename T>
-constexpr bool is_instantiation_v = is_instantiation<C, T>::value;
-}  
-
-struct ThreadState;
-
-namespace cc::subtle {
-template <typename T>
-class U;
-}  
-namespace cc {
-template <typename T> class Co;
-namespace internal {
-template <typename T>
-class Promise {
-  static_assert(!k::is_instantiation_v<subtle::U, T>);
-};
-}  
-}
-
-//--- stl.h
-#pragma once
-namespace std {
-inline namespace abi {
-template <class _Tp, _Tp __v>
-struct integral_constant {
-  static const _Tp value = __v;
-};
-template <class _Tp, class _Up>
-constexpr bool is_same_v = __is_same(_Tp, _Up);
-template <class _Tp>
-using decay_t = __decay(_Tp);
-
-template <class>
-struct __invoke_result_impl ;
-template <class... _Args>
-using invoke_result_t = __invoke_result_impl<_Args...>;
-}
-}
-
-//--- main.cpp
-#include "stl.h"
-#include "a.h"
-
-namespace cc {
-template <typename F>
-  requires k::is_instantiation_v<Co, std::invoke_result_t<F>>
-using result_type =
-    std::invoke_result_t<F>;
-}  
-namespace cc::internal {
-class final {
- Promise<ThreadState> outgoing_work_;
-};
-}
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixSplatErrors.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixSplatErrors.hlsl
index 0c2e53d382180..fc3f8e7adc050 100644
--- a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixSplatErrors.hlsl
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixSplatErrors.hlsl
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -std=hlsl202x -verify %s
 
-void SplatOfVectortoMat(int4 V){
+void SplatOfUndersizedVectortoMat(int3 V){
     int2x2 M = V;
-    // expected-error at -1 {{cannot initialize a variable of type 'int2x2' (aka 'matrix<int, 2, 2>') with an lvalue of type 'int4' (aka 'vector<int, 4>')}}
+    // expected-error at -1 {{too few initializers in list for type 'int2x2' (aka 'matrix<int, 2, 2>') (expected 4 but found 3)}}
+}
+
+void SplatOfOversizedVectortoMat(int3 V){
+    int1x2 M = V;
+    // expected-error at -1 {{too many initializers in list for type 'int1x2' (aka 'matrix<int, 1, 2>') (expected 2 but found 3)}}
 }
 
 void SplatOfMattoMat(int4x3 N){
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index 490136961ebc6..e6cabcc7eb530 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -55,6 +55,9 @@
 #include <optional>
 #include <set>
 #include <system_error>
+#if LLVM_ON_UNIX
+#include <signal.h>
+#endif
 
 using namespace clang;
 using namespace clang::driver;
@@ -378,7 +381,7 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
   if (!UseNewCC1Process) {
     TheDriver.CC1Main = ExecuteCC1WithContext;
     // Ensure the CC1Command actually catches cc1 crashes
-    llvm::CrashRecoveryContext::Enable();
+    llvm::CrashRecoveryContext::Enable(true);
   }
 
   std::unique_ptr<Compilation> C(TheDriver.BuildCompilation(Args));
@@ -407,6 +410,7 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
   Driver::CommandStatus CommandStatus = Driver::CommandStatus::Ok;
   // Pretend the first command failed if ReproStatus is Always.
   const Command *FailingCommand = nullptr;
+  int CommandRes = 0;
   if (!C->getJobs().empty())
     FailingCommand = &*C->getJobs().begin();
   if (C && !C->containsError()) {
@@ -414,7 +418,7 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
     Res = TheDriver.ExecuteCompilation(*C, FailingCommands);
 
     for (const auto &P : FailingCommands) {
-      int CommandRes = P.first;
+      CommandRes = P.first;
       FailingCommand = P.second;
       if (!Res)
         Res = CommandRes;
@@ -471,6 +475,18 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
     Res = 1;
 #endif
 
+#if LLVM_ON_UNIX
+  // On Unix, signals are represented by return codes of 128 plus the signal
+  // number. If the return code indicates it was from a signal handler, raise
+  // the signal so that the exit code includes the signal number, as required
+  // by POSIX. Return code 255 is excluded because some tools, such as
+  // llvm-ifs, exit with code 255 (-1) on failure.
+  if (CommandRes > 128 && CommandRes != 255) {
+    llvm::sys::unregisterHandlers();
+    raise(CommandRes - 128);
+  }
+#endif
+
   // If we have multiple failing commands, we return the result of the first
   // failing command.
   return Res;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_unwind_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_unwind_win.cpp
index 30ba812afc4b0..f1b2a157c3538 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_unwind_win.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_unwind_win.cpp
@@ -43,10 +43,47 @@ void BufferedStackTrace::UnwindSlow(uptr pc, u32 max_depth) {
   trace_buffer[0] = pc;
 }
 
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wframe-larger-than="
-#endif
+PVOID CALLBACK FallbackFunctionTableAccess(HANDLE hProcess,
+                                           DWORD64 dwAddrBase) {
+  // First try DbgHelp's function; a non-null result is returned unchanged.
+  if (PVOID pResult =
+          __sanitizer::SymFunctionTableAccess64(hProcess, dwAddrBase)) {
+    return pResult;
+  }
+
+  // Fall back to RtlLookupFunctionEntry for dynamic code.
+  // Functions registered with RtlAddFunctionTable are not necessarily known
+  // to DbgHelp, so this is required to cover some edge cases (e.g. JIT
+  // compilers can use Rtl* functions).
+#    if SANITIZER_WINDOWS64
+  DWORD64 dw64ImageBase = 0;
+  return RtlLookupFunctionEntry(dwAddrBase, &dw64ImageBase, nullptr);
+#    else
+  return nullptr;
+#    endif
+}
+
+DWORD64 CALLBACK FallbackGetModuleBase(HANDLE hProcess, DWORD64 dwAddr) {
+  if (DWORD64 dwResult = __sanitizer::SymGetModuleBase64(hProcess, dwAddr)) {
+    return dwResult; // The primary lookup produced a non-zero base.
+  }
+
+  // Both GetModuleBase and FunctionTableAccess must provide this fallback,
+  // otherwise dynamic functions won't be properly unwound.
+#    if SANITIZER_WINDOWS64
+  DWORD64 dw64ImageBase = 0;
+  if (RtlLookupFunctionEntry(dwAddr, &dw64ImageBase, nullptr)) {
+    return dw64ImageBase;
+  }
+#    endif
+
+  return 0; // No module base could be determined.
+}
+
+#    ifdef __clang__
+#      pragma clang diagnostic push
+#      pragma clang diagnostic ignored "-Wframe-larger-than="
+#    endif
 void BufferedStackTrace::UnwindSlow(uptr pc, void *context, u32 max_depth) {
   CHECK(context);
   CHECK_GE(max_depth, 2);
@@ -91,8 +128,8 @@ void BufferedStackTrace::UnwindSlow(uptr pc, void *context, u32 max_depth) {
   stack_frame.AddrFrame.Mode = AddrModeFlat;
   stack_frame.AddrStack.Mode = AddrModeFlat;
   while (StackWalk64(machine_type, GetCurrentProcess(), GetCurrentThread(),
-                     &stack_frame, &ctx, NULL, SymFunctionTableAccess64,
-                     SymGetModuleBase64, NULL) &&
+                     &stack_frame, &ctx, NULL, FallbackFunctionTableAccess,
+                     FallbackGetModuleBase, NULL) &&
          size < Min(max_depth, kStackTraceMax)) {
     trace_buffer[size++] = (uptr)stack_frame.AddrPC.Offset;
   }
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 37297dd0ad6d1..e75bddc7e1bef 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -456,13 +456,18 @@ mlir::Value inlineElementalOp(
 /// over the optimal extents deduced from both shapes. If \p emitWorkshareLoop
 /// is true, a workshare loop construct may be emitted when available.
 /// Allocatable LHS must be allocated with the right shape and parameters.
+/// An optional scalarCombineAndAssign callback can be supplied to implement
+/// more complex assignment actions, such as reductions that may need to happen
+/// atomically. When provided, the callback is passed scalar addresses for the
+/// LHS and RHS elements and is in charge of generating the combination and
+/// assignment logic.
 void genNoAliasArrayAssignment(
     mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
     hlfir::Entity lhs, bool emitWorkshareLoop = false,
     bool temporaryLHS = false,
-    std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
-                                hlfir::Entity, hlfir::Entity)> *combiner =
-        nullptr,
+    std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+                       hlfir::Entity, mlir::ArrayAttr)>
+        *scalarCombineAndAssign = nullptr,
     mlir::ArrayAttr accessGroups = {});
 
 /// Generate an assignment from \p rhs to \p lhs when they are known not to
@@ -474,19 +479,19 @@ void genNoAliasAssignment(
     mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
     hlfir::Entity lhs, bool emitWorkshareLoop = false,
     bool temporaryLHS = false,
-    std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
-                                hlfir::Entity, hlfir::Entity)> *combiner =
-        nullptr,
+    std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+                       hlfir::Entity, mlir::ArrayAttr accessGroups)>
+        *scalarCombineAndAssign = nullptr,
     mlir::ArrayAttr accessGroups = {});
 inline void genNoAliasAssignment(
     mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
     hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
-    std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
-                                hlfir::Entity, hlfir::Entity)>
-        combiner,
+    std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+                       hlfir::Entity, mlir::ArrayAttr)>
+        scalarCombineAndAssign,
     mlir::ArrayAttr accessGroups = {}) {
   genNoAliasAssignment(loc, builder, rhs, lhs, emitWorkshareLoop, temporaryLHS,
-                       &combiner, accessGroups);
+                       &scalarCombineAndAssign, accessGroups);
 }
 
 /// Create a new temporary with the shape and parameters of the provided
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 6eedb089eac40..687c2f0f4a42a 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2540,7 +2540,6 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       // PFT branch analysis), allowing the loop to exit only when the condition
       // becomes false.
       if (!unstructuredContext) {
-        maybeStartBlock(preheaderBlock); // no block or empty block
         genDoWhileAsSCFWhile(*whileCondition, eval, doStmtEval);
         return;
       }
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 3355bf1475e30..e7a286b73bb4b 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -1395,20 +1395,17 @@ bool hlfir::elementalOpMustProduceTemp(hlfir::ElementalOp elemental) {
 static void combineAndStoreElement(
     mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity lhs,
     hlfir::Entity rhs, bool temporaryLHS,
-    std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
-                                hlfir::Entity, hlfir::Entity)> *combiner,
+    std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+                       hlfir::Entity, mlir::ArrayAttr)> *scalarCombineAndAssign,
     mlir::ArrayAttr accessGroups) {
+  if (scalarCombineAndAssign) {
+    (*scalarCombineAndAssign)(loc, builder, lhs, rhs, accessGroups);
+    return;
+  }
   hlfir::Entity valueToAssign = hlfir::loadTrivialScalar(loc, builder, rhs);
   if (accessGroups)
     if (auto load = valueToAssign.getDefiningOp<fir::LoadOp>())
       load.setAccessGroupsAttr(accessGroups);
-  if (combiner) {
-    hlfir::Entity lhsValue = hlfir::loadTrivialScalar(loc, builder, lhs);
-    if (accessGroups)
-      if (auto load = lhsValue.getDefiningOp<fir::LoadOp>())
-        load.setAccessGroupsAttr(accessGroups);
-    valueToAssign = (*combiner)(loc, builder, lhsValue, valueToAssign);
-  }
   auto assign = hlfir::AssignOp::create(builder, loc, valueToAssign, lhs,
                                         /*realloc=*/false,
                                         /*keep_lhs_length_if_realloc=*/false,
@@ -1420,8 +1417,8 @@ static void combineAndStoreElement(
 void hlfir::genNoAliasArrayAssignment(
     mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
     hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
-    std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
-                                hlfir::Entity, hlfir::Entity)> *combiner,
+    std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+                       hlfir::Entity, mlir::ArrayAttr)> *scalarCombineAndAssign,
     mlir::ArrayAttr accessGroups) {
   mlir::OpBuilder::InsertionGuard guard(builder);
   rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
@@ -1441,28 +1438,30 @@ void hlfir::genNoAliasArrayAssignment(
   builder.setInsertionPointToStart(loopNest.body);
   auto rhsArrayElement =
       hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
-  rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
+  if (!scalarCombineAndAssign)
+    rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
   auto lhsArrayElement =
       hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
   combineAndStoreElement(loc, builder, lhsArrayElement, rhsArrayElement,
-                         temporaryLHS, combiner, accessGroups);
+                         temporaryLHS, scalarCombineAndAssign, accessGroups);
 }
 
 void hlfir::genNoAliasAssignment(
     mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
     hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
-    std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
-                                hlfir::Entity, hlfir::Entity)> *combiner,
+    std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+                       hlfir::Entity, mlir::ArrayAttr)> *scalarCombineAndAssign,
     mlir::ArrayAttr accessGroups) {
   if (lhs.isArray()) {
     genNoAliasArrayAssignment(loc, builder, rhs, lhs, emitWorkshareLoop,
-                              temporaryLHS, combiner, accessGroups);
+                              temporaryLHS, scalarCombineAndAssign,
+                              accessGroups);
     return;
   }
   rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
   lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
-  combineAndStoreElement(loc, builder, lhs, rhs, temporaryLHS, combiner,
-                         accessGroups);
+  combineAndStoreElement(loc, builder, lhs, rhs, temporaryLHS,
+                         scalarCombineAndAssign, accessGroups);
 }
 
 std::pair<hlfir::Entity, bool>
diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
index 9ced235f05707..d8ed9ce968e0a 100644
--- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
+++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
@@ -31,6 +31,19 @@
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/CommandLine.h"
+
+static llvm::cl::opt<bool> useAccReductionCombine(
+    "openacc-use-reduction-combine",
+    llvm::cl::desc("Whether to generate acc.reduction_combine. Does not "
+                   "control reduction for MIN/MAX and logical reductions."),
+    llvm::cl::init(false));
+
+static llvm::cl::opt<bool> useAccReductionCombineAll(
+    "openacc-use-reduction-combine-all",
+    llvm::cl::desc("Whether to generate acc.reduction_combine for all types "
+                   "and operators"),
+    llvm::cl::init(false));
 
 namespace fir::acc {
 
@@ -1045,6 +1058,25 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder,
   TODO(loc, "reduction operator");
 }
 
+static bool useAccReductionCombineOp(mlir::Type elementType,
+                                     mlir::acc::ReductionOperator op) {
+  if (useAccReductionCombineAll)
+    return true; // The "-all" flag bypasses the type/operator checks below.
+  if (!useAccReductionCombine)
+    return false;
+  // LOGICAL operators do not have MLIR operators and require FIR-specific
+  // logic to interpret the TRUE and FALSE values from the storage (implemented
+  // in fir.convert to i1).
+  if (!llvm::isa<mlir::IntegerType, mlir::FloatType, mlir::ComplexType>(
+          elementType))
+    return false;
+  // MIN/MAX for floating point can have different edge-case behaviors (NaNs),
+  // and the MLIR operator currently does not match flang's behavior. MIN/MAX
+  // are therefore excluded here for every element type.
+  return op != mlir::acc::ReductionOperator::AccMax &&
+         op != mlir::acc::ReductionOperator::AccMin;
+}
+
 template <typename Ty>
 bool OpenACCMappableModel<Ty>::generateCombiner(
     mlir::Type type, mlir::OpBuilder &mlirBuilder, mlir::Location loc,
@@ -1069,11 +1101,25 @@ bool OpenACCMappableModel<Ty>::generateCombiner(
   }
 
   mlir::Type elementType = fir::getFortranElementType(dest.getType());
-  auto genKernel = [&](mlir::Location l, fir::FirOpBuilder &b,
-                       hlfir::Entity srcElementValue,
-                       hlfir::Entity destElementValue) -> hlfir::Entity {
-    return hlfir::Entity{genScalarCombiner(builder, loc, op, elementType,
-                                           srcElementValue, destElementValue)};
+  auto genKernel =
+      [&](mlir::Location l, fir::FirOpBuilder &b, hlfir::Entity destElementAddr,
+          hlfir::Entity srcElementAddr, mlir::ArrayAttr accessGroups) -> void {
+    assert(!accessGroups && "access groups not expected in acc reductions");
+    if (useAccReductionCombineOp(elementType, op)) {
+      mlir::acc::ReductionCombineOp::create(builder, loc, destElementAddr,
+                                            srcElementAddr, op);
+      return;
+    }
+    hlfir::Entity srcElementValue =
+        hlfir::loadTrivialScalar(loc, builder, srcElementAddr);
+    hlfir::Entity destElementValue =
+        hlfir::loadTrivialScalar(loc, builder, destElementAddr);
+    hlfir::Entity combined(genScalarCombiner(
+        builder, loc, op, elementType, destElementValue, srcElementValue));
+    hlfir::AssignOp::create(builder, loc, combined, destElementAddr,
+                            /*realloc=*/false,
+                            /*keep_lhs_length_if_realloc=*/false,
+                            /*temporary_lhs=*/false);
   };
   hlfir::genNoAliasAssignment(loc, builder, srcSection, destSection,
                               /*emitWorkshareLoop=*/false,
diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90
index 339a4e3435c0d..2c79cacada050 100644
--- a/flang/test/Lower/OpenACC/acc-reduction.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction.f90
@@ -1,6 +1,7 @@
 ! This test checks lowering of OpenACC reduction clause.
 
 ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -fopenacc -emit-hlfir %s -o - -openacc-use-reduction-combine | FileCheck -check-prefix=ACC_COMBINE %s
 
 ! CHECK-LABEL:   acc.reduction.recipe @reduction_lor_ref_box_heap_l32 : !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>> reduction_operator <lor> init {
 ! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>>):
@@ -97,7 +98,6 @@
 ! CHECK:               %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_5]]#0, %[[CONSTANT_7]] : index
 ! CHECK:               %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
 ! CHECK:               %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]], %[[ADDI_1]])  : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
-! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:               %[[CONSTANT_8:.*]] = arith.constant 0 : index
 ! CHECK:               %[[BOX_DIMS_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_8]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 ! CHECK:               %[[CONSTANT_9:.*]] = arith.constant 1 : index
@@ -108,6 +108,7 @@
 ! CHECK:               %[[SUBI_3:.*]] = arith.subi %[[BOX_DIMS_7]]#0, %[[CONSTANT_10]] : index
 ! CHECK:               %[[ADDI_3:.*]] = arith.addi %[[VAL_2]], %[[SUBI_3]] : index
 ! CHECK:               %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_2]], %[[ADDI_3]])  : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:               %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
 ! CHECK:               %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
 ! CHECK:               %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
@@ -170,13 +171,13 @@
 ! CHECK:             %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
 ! CHECK:             %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]])  : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK:             %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[CONSTANT_5:.*]] = arith.constant 0 : index
 ! CHECK:             %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
 ! CHECK:             %[[CONSTANT_6:.*]] = arith.constant 1 : index
 ! CHECK:             %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
 ! CHECK:             %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]])  : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK:             %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
 ! CHECK:             %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32
 ! CHECK:             %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32
@@ -239,13 +240,13 @@
 ! CHECK:             %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
 ! CHECK:             %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]])  : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK:             %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[CONSTANT_5:.*]] = arith.constant 0 : index
 ! CHECK:             %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
 ! CHECK:             %[[CONSTANT_6:.*]] = arith.constant 1 : index
 ! CHECK:             %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
 ! CHECK:             %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]])  : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK:             %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
 ! CHECK:             %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32
 ! CHECK:             %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32
@@ -321,8 +322,8 @@
 ! CHECK:           %[[CONSTANT_9:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_9]] to %[[CONSTANT_5]] step %[[CONSTANT_9]] unordered {
 ! CHECK:             %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
 ! CHECK:             %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
 ! CHECK:             %[[ADDI_4:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:             hlfir.assign %[[ADDI_4]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
@@ -371,13 +372,13 @@
 ! CHECK:             %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
 ! CHECK:             %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[CONSTANT_5:.*]] = arith.constant 0 : index
 ! CHECK:             %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
 ! CHECK:             %[[CONSTANT_6:.*]] = arith.constant 1 : index
 ! CHECK:             %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
 ! CHECK:             %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
 ! CHECK:             %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
 ! CHECK:             %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
@@ -434,13 +435,13 @@
 ! CHECK:             %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
 ! CHECK:             %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:             %[[CONSTANT_5:.*]] = arith.constant 0 : index
 ! CHECK:             %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 ! CHECK:             %[[CONSTANT_6:.*]] = arith.constant 1 : index
 ! CHECK:             %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
 ! CHECK:             %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
 ! CHECK:             %[[ADDI_2:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:             hlfir.assign %[[ADDI_2]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -524,8 +525,8 @@
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_17]] to %[[CONSTANT_9]] step %[[CONSTANT_17]] unordered {
 ! CHECK:             fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_17]] to %[[CONSTANT_6]] step %[[CONSTANT_17]] unordered {
 ! CHECK:               %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32>
-! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
 ! CHECK:               %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32>
+! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
 ! CHECK:               %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
 ! CHECK:               %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:               hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
@@ -584,8 +585,8 @@
 ! CHECK:           %[[CONSTANT_10:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_10]] to %[[CONSTANT_5]] step %[[CONSTANT_10]] unordered {
 ! CHECK:             %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
 ! CHECK:             %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]])  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
 ! CHECK:             %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:             hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
@@ -847,8 +848,8 @@
 ! CHECK:           %[[CONSTANT_2:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
 ! CHECK:             %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
 ! CHECK:             %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
@@ -906,8 +907,8 @@
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
 ! CHECK:             fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
 ! CHECK:               %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:               %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:               %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
 ! CHECK:               %[[CMPI_0:.*]] = arith.cmpi sgt, %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:               %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
@@ -966,8 +967,8 @@
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
 ! CHECK:             fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
 ! CHECK:               %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
-! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:               %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
+! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:               %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
 ! CHECK:               %[[CMPF_0:.*]] = arith.cmpf olt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
 ! CHECK:               %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
@@ -1019,8 +1020,8 @@
 ! CHECK:           %[[CONSTANT_2:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
 ! CHECK:             %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:             %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
@@ -1071,8 +1072,8 @@
 ! CHECK:           %[[CONSTANT_2:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
 ! CHECK:             %[[MULF_0:.*]] = arith.mulf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
 ! CHECK:             hlfir.assign %[[MULF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
@@ -1121,8 +1122,8 @@
 ! CHECK:           %[[CONSTANT_2:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
 ! CHECK:             %[[MULI_0:.*]] = arith.muli %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:             hlfir.assign %[[MULI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -1171,8 +1172,8 @@
 ! CHECK:           %[[CONSTANT_2:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
 ! CHECK:             %[[ADDF_0:.*]] = arith.addf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
 ! CHECK:             hlfir.assign %[[ADDF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
@@ -1235,8 +1236,8 @@
 ! CHECK:             fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_1]] step %[[CONSTANT_6]] unordered {
 ! CHECK:               fir.do_loop %[[VAL_4:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_0]] step %[[CONSTANT_6]] unordered {
 ! CHECK:                 %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK:                 %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:                 %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
+! CHECK:                 %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:                 %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
 ! CHECK:                 %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:                 hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -1278,8 +1279,8 @@
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
 ! CHECK:             fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
 ! CHECK:               %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:               %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]])  : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK:               %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:               %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
 ! CHECK:               %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:               hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -1313,8 +1314,8 @@
 ! CHECK:           %[[CONSTANT_2:.*]] = arith.constant 1 : index
 ! CHECK:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
 ! CHECK:             %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:             %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]])  : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
 ! CHECK:             %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
 ! CHECK:             %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
 ! CHECK:             hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -1884,3 +1885,111 @@ subroutine acc_reduction_logical_allocatable(l)
 ! CHECK-LABEL:   func.func @_QPacc_reduction_logical_allocatable(
 ! CHECK:           %[[REDUCTION_0:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>>) recipe(@reduction_lor_ref_box_heap_l32) -> !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>> {name = "l"}
 ! CHECK:           acc.parallel reduction(%[[REDUCTION_0]] : !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>>)
+
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_lor_ref_box_heap_l32 : !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>> reduction_operator <lor> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_max_box_UxUxf32 : !fir.box<!fir.array<?x?xf32>> reduction_operator <max> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_max_ref_box_ptr_Uxf32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> reduction_operator <max> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_max_ref_box_heap_Uxf32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> reduction_operator <max> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
+! ACC_COMBINE-LABEL:   } combiner {
+! ACC_COMBINE:         ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>):
+! ACC_COMBINE:           %[[CONSTANT_0:.*]] = arith.constant 1 : index
+! ACC_COMBINE:           %[[CONSTANT_1:.*]] = arith.constant 1 : index
+! ACC_COMBINE:           %[[CONSTANT_2:.*]] = arith.constant 3 : index
+! ACC_COMBINE:           %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! ACC_COMBINE:           %[[CONSTANT_4:.*]] = arith.constant 2 : index
+! ACC_COMBINE:           %[[CONSTANT_5:.*]] = arith.constant 3 : index
+! ACC_COMBINE:           %[[CONSTANT_6:.*]] = arith.constant true
+! ACC_COMBINE:           %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_5]] : (index) -> !fir.shape<1>
+! ACC_COMBINE:           %[[CONSTANT_7:.*]] = arith.constant 0 : index
+! ACC_COMBINE:           %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_7]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! ACC_COMBINE:           %[[ADDI_0:.*]] = arith.addi %[[BOX_DIMS_0]]#0, %[[CONSTANT_1]] : index
+! ACC_COMBINE:           %[[ADDI_1:.*]] = arith.addi %[[BOX_DIMS_0]]#0, %[[CONSTANT_2]] : index
+! ACC_COMBINE:           %[[CONSTANT_8:.*]] = arith.constant 0 : index
+! ACC_COMBINE:           %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_8]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! ACC_COMBINE:           %[[ADDI_2:.*]] = arith.addi %[[BOX_DIMS_1]]#0, %[[CONSTANT_1]] : index
+! ACC_COMBINE:           %[[ADDI_3:.*]] = arith.addi %[[BOX_DIMS_1]]#0, %[[CONSTANT_2]] : index
+! ACC_COMBINE:           %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_2]]:%[[ADDI_3]]:%[[CONSTANT_0]])  shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! ACC_COMBINE:           %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_0]]:%[[ADDI_1]]:%[[CONSTANT_0]])  shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! ACC_COMBINE:           %[[CONSTANT_9:.*]] = arith.constant 1 : index
+! ACC_COMBINE:           fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_9]] to %[[CONSTANT_5]] step %[[CONSTANT_9]] unordered {
+! ACC_COMBINE:             %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! ACC_COMBINE:             %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! ACC_COMBINE:             acc.reduction_combine %[[DESIGNATE_2]] into %[[DESIGNATE_3]] <add> : !fir.ref<i32>
+! ACC_COMBINE:           }
+! ACC_COMBINE:           acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xi32>>
+! ACC_COMBINE:         }
+
+
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
+! ACC_COMBINE:             acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_section_lb0.ub9xlb0.ub19_ref_10x20xi32 : !fir.ref<!fir.array<10x20xi32>> reduction_operator <add> init {
+! ACC_COMBINE:               acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_section_lb10.ub19_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
+! ACC_COMBINE:             acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> reduction_operator <add> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ptr<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> reduction_operator <add> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.heap<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_mul_ref_z32 : !fir.ref<complex<f32>> reduction_operator <mul> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<complex<f32>>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_z32 : !fir.ref<complex<f32>> reduction_operator <add> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<complex<f32>>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_neqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <neqv> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_eqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <eqv> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_lor_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <lor> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_land_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <land> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_xor_ref_i32 : !fir.ref<i32> reduction_operator <xor> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <xor> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_ior_ref_i32 : !fir.ref<i32> reduction_operator <ior> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <ior> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_iand_ref_i32 : !fir.ref<i32> reduction_operator <iand> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <iand> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_max_ref_f32 : !fir.ref<f32> reduction_operator <max> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_max_ref_i32 : !fir.ref<i32> reduction_operator <max> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_min_ref_f32 : !fir.ref<f32> reduction_operator <min> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_min_ref_i32 : !fir.ref<i32> reduction_operator <min> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_mul_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <mul> init {
+! ACC_COMBINE-NOT:     acc.reduction_combine
+! ACC_COMBINE:             acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<f32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_mul_ref_f32 : !fir.ref<f32> reduction_operator <mul> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<f32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init {
+! ACC_COMBINE:             acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_mul_ref_i32 : !fir.ref<i32> reduction_operator <mul> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init {
+! ACC_COMBINE:             acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<f32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_f32 : !fir.ref<f32> reduction_operator <add> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<f32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init {
+! ACC_COMBINE:                 acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init {
+! ACC_COMBINE:               acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
+! ACC_COMBINE:             acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL:   acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init {
+! ACC_COMBINE:           acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
diff --git a/flang/test/Lower/do-while-to-scf-while.f90 b/flang/test/Lower/do-while-to-scf-while.f90
index d2f38d6e09694..6d057ed823c36 100644
--- a/flang/test/Lower/do-while-to-scf-while.f90
+++ b/flang/test/Lower/do-while-to-scf-while.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -lower-do-while-to-scf-while %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -lower-do-while-to-scf-while %s -o - | FileCheck %s
 
 ! CHECK-LABEL: func.func @_QPsimple_do_while()
 ! CHECK: scf.while
@@ -85,3 +85,20 @@ subroutine do_while_goto_internal_backedge()
   print *, "sum=", sum
 end subroutine do_while_goto_internal_backedge
 
+! CHECK-LABEL:   func.func @_QPtest_after_unstructured(
+! CHECK:  scf.while
+! CHECK-NOT: cf.br
+! CHECK: return
+subroutine test_after_unstructured(cdt, switch)
+  logical :: cdt, eval
+  integer :: switch, i = 1
+  if (cdt) then
+    select case (switch)
+      case (0)
+        call print1()
+    end select
+  end if
+  do while(eval(i))
+    call incr(i)
+  end do
+end subroutine
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 828980a3500df..841ad0a34b2f1 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -34,7 +34,13 @@
 #include "math/bf16addf.h"
 #include "math/bf16addf128.h"
 #include "math/bf16divf.h"
+#include "math/bf16divl.h"
 #include "math/bf16fmaf.h"
+#include "math/bf16fmal.h"
+#include "math/bf16mul.h"
+#include "math/bf16mulf.h"
+#include "math/bf16mulf128.h"
+#include "math/bf16mull.h"
 #include "math/canonicalize.h"
 #include "math/canonicalizebf16.h"
 #include "math/canonicalizef.h"
@@ -70,6 +76,10 @@
 #include "math/expm1.h"
 #include "math/expm1f.h"
 #include "math/expm1f16.h"
+#include "math/f16add.h"
+#include "math/f16addf.h"
+#include "math/f16addf128.h"
+#include "math/f16addl.h"
 #include "math/f16fma.h"
 #include "math/f16fmaf.h"
 #include "math/f16fmaf128.h"
@@ -113,6 +123,7 @@
 #include "math/logbf.h"
 #include "math/logbf128.h"
 #include "math/logbf16.h"
+#include "math/logbl.h"
 #include "math/logf.h"
 #include "math/logf16.h"
 #include "math/pow.h"
@@ -134,7 +145,9 @@
 #include "math/sqrtf16.h"
 #include "math/tan.h"
 #include "math/tanf.h"
+#include "math/tanf16.h"
 #include "math/tanhf.h"
 #include "math/tanhf16.h"
+#include "math/tanpif.h"
 
 #endif // LLVM_LIBC_SHARED_MATH_H
diff --git a/libc/shared/math/bf16divl.h b/libc/shared/math/bf16divl.h
new file mode 100644
index 0000000000000..f30cfaa012c4f
--- /dev/null
+++ b/libc/shared/math/bf16divl.h
@@ -0,0 +1,23 @@
+//===-- Shared bf16divl function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16DIVL_H
+#define LLVM_LIBC_SHARED_MATH_BF16DIVL_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/bf16divl.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16divl;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16DIVL_H
diff --git a/libc/shared/math/bf16fmal.h b/libc/shared/math/bf16fmal.h
new file mode 100644
index 0000000000000..24aacc53c72a8
--- /dev/null
+++ b/libc/shared/math/bf16fmal.h
@@ -0,0 +1,25 @@
+//===-- Shared bf16fmal function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16FMAL_H
+#define LLVM_LIBC_SHARED_MATH_BF16FMAL_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/bf16fmal.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace shared {
+
+using math::bf16fmal;
+
+} // namespace shared
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16FMAL_H
diff --git a/libc/shared/math/bf16mul.h b/libc/shared/math/bf16mul.h
new file mode 100644
index 0000000000000..064416c498f59
--- /dev/null
+++ b/libc/shared/math/bf16mul.h
@@ -0,0 +1,22 @@
+//===-- Shared bf16mul function ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16MUL_H
+#define LLVM_LIBC_SHARED_MATH_BF16MUL_H
+
+#include "src/__support/math/bf16mul.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16mul;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16MUL_H
diff --git a/libc/shared/math/bf16mulf.h b/libc/shared/math/bf16mulf.h
new file mode 100644
index 0000000000000..456bf85bfadf4
--- /dev/null
+++ b/libc/shared/math/bf16mulf.h
@@ -0,0 +1,22 @@
+//===-- Shared bf16mulf function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16MULF_H
+#define LLVM_LIBC_SHARED_MATH_BF16MULF_H
+
+#include "src/__support/math/bf16mulf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16mulf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16MULF_H
diff --git a/libc/shared/math/bf16mulf128.h b/libc/shared/math/bf16mulf128.h
new file mode 100644
index 0000000000000..41baf47dcd78d
--- /dev/null
+++ b/libc/shared/math/bf16mulf128.h
@@ -0,0 +1,28 @@
+//===-- Shared bf16mulf128 function -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16MULF128_H
+#define LLVM_LIBC_SHARED_MATH_BF16MULF128_H
+
+#include "include/llvm-libc-types/float128.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT128
+
+#include "src/__support/math/bf16mulf128.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16mulf128;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT128
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16MULF128_H
diff --git a/libc/shared/math/bf16mull.h b/libc/shared/math/bf16mull.h
new file mode 100644
index 0000000000000..fdea2182279b7
--- /dev/null
+++ b/libc/shared/math/bf16mull.h
@@ -0,0 +1,22 @@
+//===-- Shared bf16mull function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16MULL_H
+#define LLVM_LIBC_SHARED_MATH_BF16MULL_H
+
+#include "src/__support/math/bf16mull.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16mull;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16MULL_H
diff --git a/libc/shared/math/f16add.h b/libc/shared/math/f16add.h
new file mode 100644
index 0000000000000..4a51de05d0857
--- /dev/null
+++ b/libc/shared/math/f16add.h
@@ -0,0 +1,29 @@
+//===-- Shared f16add function ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_F16ADD_H
+#define LLVM_LIBC_SHARED_MATH_F16ADD_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/f16add.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::f16add;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_F16ADD_H
diff --git a/libc/shared/math/f16addf.h b/libc/shared/math/f16addf.h
new file mode 100644
index 0000000000000..346b584cb826d
--- /dev/null
+++ b/libc/shared/math/f16addf.h
@@ -0,0 +1,29 @@
+//===-- Shared f16addf function ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_F16ADDF_H
+#define LLVM_LIBC_SHARED_MATH_F16ADDF_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/f16addf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::f16addf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_F16ADDF_H
diff --git a/libc/shared/math/f16addf128.h b/libc/shared/math/f16addf128.h
new file mode 100644
index 0000000000000..40321695a6342
--- /dev/null
+++ b/libc/shared/math/f16addf128.h
@@ -0,0 +1,32 @@
+//===-- Shared f16addf128 function ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_F16ADDF128_H
+#define LLVM_LIBC_SHARED_MATH_F16ADDF128_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+#include "include/llvm-libc-types/float128.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+#ifdef LIBC_TYPES_HAS_FLOAT128
+
+#include "shared/libc_common.h"
+#include "src/__support/math/f16addf128.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::f16addf128;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT128
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_F16ADDF128_H
diff --git a/libc/shared/math/f16addl.h b/libc/shared/math/f16addl.h
new file mode 100644
index 0000000000000..3406b0e65313a
--- /dev/null
+++ b/libc/shared/math/f16addl.h
@@ -0,0 +1,29 @@
+//===-- Shared f16addl function ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_F16ADDL_H
+#define LLVM_LIBC_SHARED_MATH_F16ADDL_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/f16addl.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::f16addl;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_F16ADDL_H
diff --git a/libc/shared/math/logbl.h b/libc/shared/math/logbl.h
new file mode 100644
index 0000000000000..d2bee4afe4e76
--- /dev/null
+++ b/libc/shared/math/logbl.h
@@ -0,0 +1,23 @@
+//===-- Shared logbl function -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_LOGBL_H
+#define LLVM_LIBC_SHARED_MATH_LOGBL_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/logbl.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::logbl;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_LOGBL_H
diff --git a/libc/shared/math/tanf16.h b/libc/shared/math/tanf16.h
new file mode 100644
index 0000000000000..b8ca2b87335e5
--- /dev/null
+++ b/libc/shared/math/tanf16.h
@@ -0,0 +1,29 @@
+//===-- Shared tanf16 function ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_TANF16_H
+#define LLVM_LIBC_SHARED_MATH_TANF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/tanf16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::tanf16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_TANF16_H
diff --git a/libc/shared/math/tanpif.h b/libc/shared/math/tanpif.h
new file mode 100644
index 0000000000000..4c1f691ddb1d2
--- /dev/null
+++ b/libc/shared/math/tanpif.h
@@ -0,0 +1,23 @@
+//===-- Shared tanpif function ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_TANPIF_H
+#define LLVM_LIBC_SHARED_MATH_TANPIF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/tanpif.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::tanpif;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_TANPIF_H
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index af2c66597b75a..98e2721b73a65 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -381,6 +381,43 @@ add_header_library(
     libc.src.__support.FPUtil.fma
     libc.src.__support.macros.config
 )
+add_header_library(
+  bf16mul
+  HDRS
+    bf16mul.h
+  DEPENDS
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.mul
+    libc.src.__support.macros.config
+)
+add_header_library(
+  bf16mulf
+  HDRS
+    bf16mulf.h
+  DEPENDS
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.mul
+    libc.src.__support.macros.config
+)
+add_header_library(
+  bf16mulf128
+  HDRS
+    bf16mulf128.h
+  DEPENDS
+    libc.include.llvm-libc-types.float128
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.mul
+    libc.src.__support.macros.config
+)
+add_header_library(
+  bf16mull
+  HDRS
+    bf16mull.h
+  DEPENDS
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.mul
+    libc.src.__support.macros.config
+)
 
 add_header_library(
   canonicalize
@@ -440,6 +477,16 @@ add_header_library(
 )
 
 
+add_header_library(
+  bf16divl
+  HDRS
+    bf16divl.h
+  DEPENDS
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.div
+    libc.src.__support.macros.config
+)
+
 add_header_library(
   cbrt
   HDRS
@@ -701,6 +748,47 @@ add_header_library(
     libc.src.__support.math.exp10_float16_constants
 )
 
+add_header_library(
+  f16add
+  HDRS
+    f16add.h
+  DEPENDS
+    libc.include.llvm-libc-macros.float16_macros
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+)
+
+add_header_library(
+  f16addf
+  HDRS
+    f16addf.h
+  DEPENDS
+    libc.include.llvm-libc-macros.float16_macros
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+)
+
+add_header_library(
+  f16addf128
+  HDRS
+    f16addf128.h
+  DEPENDS
+    libc.include.llvm-libc-macros.float16_macros
+    libc.include.llvm-libc-types.float128
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+)
+
+add_header_library(
+  f16addl
+  HDRS
+    f16addl.h
+  DEPENDS
+    libc.include.llvm-libc-macros.float16_macros
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+)
+
 add_header_library(
   ffmal
   HDRS
@@ -845,6 +933,16 @@ add_header_library(
     libc.include.llvm-libc-macros.float16_macros
 )
 
+add_header_library(
+  bf16fmal
+  HDRS
+    bf16fmal.h
+  DEPENDS
+    libc.src.__support.macros.config
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.FPUtil.bfloat16
+)
+
 add_header_library(
   ilogb
   HDRS
@@ -1675,6 +1773,16 @@ add_header_library(
     libc.src.__support.macros.properties.cpu_features
 )
 
+add_header_library(
+  logbl
+  HDRS
+    logbl.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+    libc.src.__support.common
+    libc.src.__support.macros.config
+)
+
 add_header_library(
   log_range_reduction
   HDRS
@@ -1888,6 +1996,23 @@ add_header_library(
     libc.src.__support.macros.optimization
 )
 
+add_header_library(
+  tanf16
+  HDRS
+    tanf16.h
+  DEPENDS
+    .sincosf16_utils
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.macros.optimization
+    libc.include.llvm-libc-macros.float16_macros
+)
+
 add_header_library(
   tanhf
   HDRS
@@ -1924,3 +2049,19 @@ add_header_library(
     libc.src.__support.macros.optimization
     libc.include.llvm-libc-macros.float16_macros
 )
+
+add_header_library(
+  tanpif
+  HDRS
+    tanpif.h
+  DEPENDS
+    .sincosf_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.macros.optimization
+)
diff --git a/libc/src/__support/math/bf16divl.h b/libc/src/__support/math/bf16divl.h
new file mode 100644
index 0000000000000..ec5a9244b98d6
--- /dev/null
+++ b/libc/src/__support/math/bf16divl.h
@@ -0,0 +1,26 @@
+//===-- Implementation header for bf16divl ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16DIVL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16DIVL_H
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/div.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE bfloat16 bf16divl(long double x, long double y) {
+  return fputil::generic::div<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16DIVL_H
diff --git a/libc/src/__support/math/bf16fmal.h b/libc/src/__support/math/bf16fmal.h
new file mode 100644
index 0000000000000..93a04d0ec8fac
--- /dev/null
+++ b/libc/src/__support/math/bf16fmal.h
@@ -0,0 +1,26 @@
+//===-- Implementation header for bf16fmal ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMAL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMAL_H
+
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE bfloat16 bf16fmal(long double x, long double y, long double z) {
+  return fputil::fma<bfloat16>(x, y, z);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMAL_H
diff --git a/libc/src/__support/math/bf16mul.h b/libc/src/__support/math/bf16mul.h
new file mode 100644
index 0000000000000..af55c519ef9cf
--- /dev/null
+++ b/libc/src/__support/math/bf16mul.h
@@ -0,0 +1,27 @@
+//===-- Implementation header for bf16mul -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16MUL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16MUL_H
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE constexpr bfloat16 bf16mul(double x, double y) {
+  return fputil::generic::mul<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16MUL_H
diff --git a/libc/src/__support/math/bf16mulf.h b/libc/src/__support/math/bf16mulf.h
new file mode 100644
index 0000000000000..24eae3d43f419
--- /dev/null
+++ b/libc/src/__support/math/bf16mulf.h
@@ -0,0 +1,27 @@
+//===-- Implementation header for bf16mulf ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF_H
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE constexpr bfloat16 bf16mulf(float x, float y) {
+  return fputil::generic::mul<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF_H
diff --git a/libc/src/__support/math/bf16mulf128.h b/libc/src/__support/math/bf16mulf128.h
new file mode 100644
index 0000000000000..d9f50044047bd
--- /dev/null
+++ b/libc/src/__support/math/bf16mulf128.h
@@ -0,0 +1,33 @@
+//===-- Implementation header for bf16mulf128 -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF128_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF128_H
+
+#include "include/llvm-libc-types/float128.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT128
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE constexpr bfloat16 bf16mulf128(float128 x, float128 y) {
+  return fputil::generic::mul<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT128
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF128_H
diff --git a/libc/src/__support/math/bf16mull.h b/libc/src/__support/math/bf16mull.h
new file mode 100644
index 0000000000000..d54a2e9bdc272
--- /dev/null
+++ b/libc/src/__support/math/bf16mull.h
@@ -0,0 +1,27 @@
+//===-- Implementation header for bf16mull ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULL_H
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE constexpr bfloat16 bf16mull(long double x, long double y) {
+  return fputil::generic::mul<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULL_H
diff --git a/libc/src/__support/math/f16add.h b/libc/src/__support/math/f16add.h
new file mode 100644
index 0000000000000..7e046c3f115dc
--- /dev/null
+++ b/libc/src/__support/math/f16add.h
@@ -0,0 +1,31 @@
+//===-- Implementation header for f16add ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_F16ADD_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_F16ADD_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE float16 f16add(double x, double y) {
+  return fputil::generic::add<float16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_F16ADD_H
diff --git a/libc/src/__support/math/f16addf.h b/libc/src/__support/math/f16addf.h
new file mode 100644
index 0000000000000..5e140bc6e5373
--- /dev/null
+++ b/libc/src/__support/math/f16addf.h
@@ -0,0 +1,31 @@
+//===-- Implementation header for f16addf -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE float16 f16addf(float x, float y) {
+  return fputil::generic::add<float16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF_H
diff --git a/libc/src/__support/math/f16addf128.h b/libc/src/__support/math/f16addf128.h
new file mode 100644
index 0000000000000..8d259a273a8d1
--- /dev/null
+++ b/libc/src/__support/math/f16addf128.h
@@ -0,0 +1,34 @@
+//===-- Implementation header for f16addf128 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF128_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF128_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+#include "include/llvm-libc-types/float128.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+#ifdef LIBC_TYPES_HAS_FLOAT128
+
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE float16 f16addf128(float128 x, float128 y) {
+  return fputil::generic::add<float16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT128
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF128_H
diff --git a/libc/src/__support/math/f16addl.h b/libc/src/__support/math/f16addl.h
new file mode 100644
index 0000000000000..88f16857be48e
--- /dev/null
+++ b/libc/src/__support/math/f16addl.h
@@ -0,0 +1,31 @@
+//===-- Implementation header for f16addl -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDL_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE float16 f16addl(long double x, long double y) {
+  return fputil::generic::add<float16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDL_H
diff --git a/libc/src/__support/math/logbl.h b/libc/src/__support/math/logbl.h
new file mode 100644
index 0000000000000..750050277c165
--- /dev/null
+++ b/libc/src/__support/math/logbl.h
@@ -0,0 +1,26 @@
+//===-- Implementation header for logbl -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_LOGBL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_LOGBL_H
+
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE constexpr long double logbl(long double x) {
+  return fputil::logb(x);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_LOGBL_H
diff --git a/libc/src/__support/math/tanf16.h b/libc/src/__support/math/tanf16.h
new file mode 100644
index 0000000000000..6b9b9224fb84d
--- /dev/null
+++ b/libc/src/__support/math/tanf16.h
@@ -0,0 +1,137 @@
+//===-- Half-precision tanf16 function --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_TANF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_TANF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "sincosf16_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE float16 tanf16(float16 x) {
+  using namespace sincosf16_internal;
+  using FPBits = fputil::FPBits<float16>;
+  FPBits xbits(x);
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  constexpr size_t N_EXCEPTS = 9;
+  constexpr fputil::ExceptValues<float16, N_EXCEPTS> TANF16_EXCEPTS{{
+      // (input, RZ output, RU offset, RD offset, RN offset)
+      {0x2894, 0x2894, 1, 0, 1},
+      {0x3091, 0x3099, 1, 0, 0},
+      {0x3098, 0x30a0, 1, 0, 0},
+      {0x55ed, 0x3911, 1, 0, 0},
+      {0x607b, 0xc638, 0, 1, 1},
+      {0x674e, 0x3b7d, 1, 0, 0},
+      {0x6807, 0x4014, 1, 0, 1},
+      {0x6f4d, 0xbe19, 0, 1, 1},
+      {0x7330, 0xcb62, 0, 1, 0},
+  }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  uint16_t x_u = xbits.uintval();
+  uint16_t x_abs = x_u & 0x7fff;
+  float xf = x;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  bool x_sign = x_u >> 15;
+  // Handle exceptional values
+  if (auto r = TANF16_EXCEPTS.lookup_odd(x_abs, x_sign);
+      LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  // |x| <= 0x1.d1p-5
+  if (LIBC_UNLIKELY(x_abs <= 0x2b44)) {
+    // |x| <= 0x1.398p-11
+    if (LIBC_UNLIKELY(x_abs <= 0x10e6)) {
+      // tan(+/-0) = +/-0
+      if (LIBC_UNLIKELY(x_abs == 0))
+        return x;
+
+      int rounding = fputil::quick_get_round();
+
+      // Exhaustive tests show that, when:
+      // x > 0, and rounding upward or
+      // x < 0, and rounding downward then,
+      // tan(x) = x * 2^-11 + x
+      if ((xbits.is_pos() && rounding == FE_UPWARD) ||
+          (xbits.is_neg() && rounding == FE_DOWNWARD))
+        return fputil::cast<float16>(fputil::multiply_add(xf, 0x1.0p-11f, xf));
+      return x;
+    }
+
+    float xsq = xf * xf;
+
+    // Degree-6 minimax even polynomial of tan(x)/x generated by Sollya with:
+    // > P = fpminimax(tan(x)/x, [|0, 2, 4, 6|], [|1, SG...|], [0, pi/32]);
+    float result = fputil::polyeval(xsq, 0x1p0f, 0x1.555556p-2f, 0x1.110ee4p-3f,
+                                    0x1.be80f6p-5f);
+
+    return fputil::cast<float16>(xf * result);
+  }
+
+  // tan(+/-inf) = NaN, and tan(NaN) = NaN
+  if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
+    if (xbits.is_signaling_nan()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+    // x = +/-inf
+    if (x_abs == 0x7c00) {
+      fputil::set_errno_if_required(EDOM);
+      fputil::raise_except_if_required(FE_INVALID);
+    }
+
+    return x + FPBits::quiet_nan().get_val();
+  }
+
+  // Range reduction:
+  // For |x| > pi/32, we perform range reduction as follows:
+  // Find k and y such that:
+  //   x = (k + y) * pi/32;
+  //   k is an integer, |y| < 0.5
+  //
+  // This is done by performing:
+  //   k = round(x * 32/pi)
+  //   y = x * 32/pi - k
+  //
+  // Once k and y are computed, we then deduce the answer by the formula:
+  // tan(x) = sin(x) / cos(x)
+  //        = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
+  float sin_k, cos_k, sin_y, cosm1_y;
+  sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y);
+
+  // Note that, cosm1_y = cos_y - 1:
+  using fputil::multiply_add;
+  return fputil::cast<float16>(
+      multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
+      multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_TANF16_H
diff --git a/libc/src/__support/math/tanpif.h b/libc/src/__support/math/tanpif.h
new file mode 100644
index 0000000000000..114fcb6053d30
--- /dev/null
+++ b/libc/src/__support/math/tanpif.h
@@ -0,0 +1,115 @@
+//===-- Single-precision tanpi function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_TANPIF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_TANPIF_H
+
+#include "sincosf_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE float tanpif(float x) {
+  using namespace sincosf_utils_internal;
+
+  using FPBits = typename fputil::FPBits<float>;
+  FPBits xbits(x);
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  constexpr size_t N_EXCEPTS = 3;
+  constexpr fputil::ExceptValues<float, N_EXCEPTS> TANPIF_EXCEPTS{{
+      // (input, RZ output, RU offset, RD offset, RN offset)
+      {0x38F26685, 0x39BE6182, 1, 0, 0},
+      {0x3E933802, 0x3FA267DD, 1, 0, 0},
+      {0x3F3663FF, 0xBFA267DD, 0, 1, 0},
+  }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  uint32_t x_u = xbits.uintval();
+  uint32_t x_abs = x_u & 0x7fff'ffffU;
+  double xd = static_cast<double>(xbits.get_val());
+
+  // Handle exceptional values
+  if (LIBC_UNLIKELY(x_abs <= 0x3F3663FF)) {
+    if (LIBC_UNLIKELY(x_abs == 0U))
+      return x;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+    bool x_sign = x_u >> 31;
+
+    if (auto r = TANPIF_EXCEPTS.lookup_odd(x_abs, x_sign);
+        LIBC_UNLIKELY(r.has_value()))
+      return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  }
+
+  // Numbers greater or equal to 2^23 are always integers, or infinity, or NaN
+  if (LIBC_UNLIKELY(x_abs >= 0x4B00'0000)) {
+    // x is inf or NaN.
+    if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) {
+      if (xbits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+
+      if (x_abs == 0x7f80'0000U) {
+        fputil::set_errno_if_required(EDOM);
+        fputil::raise_except_if_required(FE_INVALID);
+      }
+
+      return x + FPBits::quiet_nan().get_val();
+    }
+
+    return FPBits::zero(xbits.sign()).get_val(); // tanpi(integer) = +/-0
+  }
+
+  // Range reduction:
+  // For |x| > 1/32, we perform range reduction as follows:
+  // Find k and y such that:
+  //   x = (k + y) * 1/32
+  //   k is an integer
+  //   |y| < 0.5
+  //
+  // This is done by performing:
+  //   k = round(x * 32)
+  //   y = x * 32 - k
+  //
+  // Once k and y are computed, we then deduce the answer by the formula:
+  // tan(pi * x) = sin(pi * x) / cos(pi * x)
+  //   = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
+  double sin_k, cos_k, sin_y, cosm1_y; // cosm1_y = cos_y - 1
+  sincospif_eval(xd, sin_k, cos_k, sin_y, cosm1_y);
+
+  if (LIBC_UNLIKELY(sin_y == 0 && cos_k == 0)) { // denominator is exactly 0
+    fputil::set_errno_if_required(EDOM);
+    fputil::raise_except_if_required(FE_DIVBYZERO);
+
+    int32_t x_mp5_i = static_cast<int32_t>(xd - 0.5);
+    return FPBits::inf((x_mp5_i & 0x1) ? Sign::NEG : Sign::POS).get_val();
+  }
+
+  using fputil::multiply_add;
+  return fputil::cast<float>(
+      multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
+      multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_TANPIF_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 47101706ce4c8..d9e33686a3132 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -453,16 +453,7 @@ add_entrypoint_object(
   HDRS
     ../tanf16.h
   DEPENDS
-    libc.hdr.errno_macros
-    libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.cast
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.macros.optimization
-    libc.src.__support.macros.properties.types
-    libc.src.__support.math.sincosf16_utils
+    libc.src.__support.math.tanf16
 )
 
 add_entrypoint_object(
@@ -472,12 +463,7 @@ add_entrypoint_object(
   HDRS
     ../tanpif.h
   DEPENDS
-    libc.src.__support.math.sincosf_utils
-    libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.macros.optimization
+    libc.src.__support.math.tanpif
 )
 
 add_entrypoint_object(
@@ -2013,7 +1999,7 @@ add_entrypoint_object(
   HDRS
     ../logbl.h
   DEPENDS
-    libc.src.__support.FPUtil.manipulation_functions
+    libc.src.__support.math.logbl
 )
 
 add_entrypoint_object(
@@ -4791,8 +4777,7 @@ add_entrypoint_object(
   HDRS
     ../f16add.h
   DEPENDS
-    libc.src.__support.macros.properties.types
-    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.math.f16add
 )
 
 add_entrypoint_object(
@@ -4802,30 +4787,27 @@ add_entrypoint_object(
   HDRS
     ../f16addf.h
   DEPENDS
-    libc.src.__support.macros.properties.types
-    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.math.f16addf
 )
 
 add_entrypoint_object(
-  f16addl
+  f16addf128
   SRCS
-    f16addl.cpp
+    f16addf128.cpp
   HDRS
-    ../f16addl.h
+    ../f16addf128.h
   DEPENDS
-    libc.src.__support.macros.properties.types
-    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.math.f16addf128
 )
 
 add_entrypoint_object(
-  f16addf128
+  f16addl
   SRCS
-    f16addf128.cpp
+    f16addl.cpp
   HDRS
-    ../f16addf128.h
+    ../f16addl.h
   DEPENDS
-    libc.src.__support.macros.properties.types
-    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.math.f16addl
 )
 
 add_entrypoint_object(
@@ -5189,11 +5171,7 @@ add_entrypoint_object(
   HDRS
     ../bf16divl.h
   DEPENDS
-    libc.src.__support.common
-    libc.src.__support.FPUtil.bfloat16
-    libc.src.__support.FPUtil.generic.div
-    libc.src.__support.macros.config
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.bf16divl
 )
 
 add_entrypoint_object(
@@ -5241,11 +5219,7 @@ add_entrypoint_object(
   HDRS
     ../bf16fmal.h
   DEPENDS
-    libc.src.__support.common
-    libc.src.__support.FPUtil.bfloat16
-    libc.src.__support.FPUtil.fma
-    libc.src.__support.macros.config
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.bf16fmal
 )
 
 add_entrypoint_object(
@@ -5269,11 +5243,7 @@ add_entrypoint_object(
   HDRS
     ../bf16mul.h
   DEPENDS
-    libc.src.__support.common
-    libc.src.__support.FPUtil.bfloat16
-    libc.src.__support.FPUtil.generic.mul
-    libc.src.__support.macros.config
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.bf16mul
 )
 
 add_entrypoint_object(
@@ -5283,11 +5253,7 @@ add_entrypoint_object(
   HDRS
     ../bf16mulf.h
   DEPENDS
-    libc.src.__support.common
-    libc.src.__support.FPUtil.bfloat16
-    libc.src.__support.FPUtil.generic.mul
-    libc.src.__support.macros.config
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.bf16mulf
 )
 
 add_entrypoint_object(
@@ -5297,11 +5263,7 @@ add_entrypoint_object(
   HDRS
     ../bf16mull.h
   DEPENDS
-    libc.src.__support.common
-    libc.src.__support.FPUtil.bfloat16
-    libc.src.__support.FPUtil.generic.mul
-    libc.src.__support.macros.config
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.bf16mull
 )
 
 add_entrypoint_object(
@@ -5311,11 +5273,7 @@ add_entrypoint_object(
   HDRS
     ../bf16mulf128.h
   DEPENDS
-    libc.src.__support.common
-    libc.src.__support.FPUtil.bfloat16
-    libc.src.__support.FPUtil.generic.mul
-    libc.src.__support.macros.config
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.bf16mulf128
 )
 
 add_entrypoint_object(
diff --git a/libc/src/math/generic/bf16divl.cpp b/libc/src/math/generic/bf16divl.cpp
index 21dd6b150e07a..432ed829005f0 100644
--- a/libc/src/math/generic/bf16divl.cpp
+++ b/libc/src/math/generic/bf16divl.cpp
@@ -7,15 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/bf16divl.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/div.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16divl.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(bfloat16, bf16divl, (long double x, long double y)) {
-  return fputil::generic::div<bfloat16>(x, y);
+  return math::bf16divl(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16fmal.cpp b/libc/src/math/generic/bf16fmal.cpp
index f31ec6904760b..0e8f1901c2093 100644
--- a/libc/src/math/generic/bf16fmal.cpp
+++ b/libc/src/math/generic/bf16fmal.cpp
@@ -7,16 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/bf16fmal.h"
-
-#include "src/__support/FPUtil/FMA.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16fmal.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(bfloat16, bf16fmal,
                    (long double x, long double y, long double z)) {
-  return fputil::fma<bfloat16>(x, y, z);
+  return math::bf16fmal(x, y, z);
 }
+
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16mul.cpp b/libc/src/math/generic/bf16mul.cpp
index c50eec2b52e5c..e6b5d81004d97 100644
--- a/libc/src/math/generic/bf16mul.cpp
+++ b/libc/src/math/generic/bf16mul.cpp
@@ -7,15 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/bf16mul.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/mul.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16mul.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(bfloat16, bf16mul, (double x, double y)) {
-  return fputil::generic::mul<bfloat16>(x, y);
+  return math::bf16mul(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16mulf.cpp b/libc/src/math/generic/bf16mulf.cpp
index 117fcd1d661ab..a16086decb7ca 100644
--- a/libc/src/math/generic/bf16mulf.cpp
+++ b/libc/src/math/generic/bf16mulf.cpp
@@ -7,15 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/bf16mulf.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/mul.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16mulf.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(bfloat16, bf16mulf, (float x, float y)) {
-  return fputil::generic::mul<bfloat16>(x, y);
+  return math::bf16mulf(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16mulf128.cpp b/libc/src/math/generic/bf16mulf128.cpp
index ff2a081d82e6b..685568c15d161 100644
--- a/libc/src/math/generic/bf16mulf128.cpp
+++ b/libc/src/math/generic/bf16mulf128.cpp
@@ -7,15 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/bf16mulf128.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/mul.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16mulf128.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(bfloat16, bf16mulf128, (float128 x, float128 y)) {
-  return fputil::generic::mul<bfloat16>(x, y);
+  return math::bf16mulf128(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16mull.cpp b/libc/src/math/generic/bf16mull.cpp
index e7c4fc085a3cd..11a7ef833e6fb 100644
--- a/libc/src/math/generic/bf16mull.cpp
+++ b/libc/src/math/generic/bf16mull.cpp
@@ -7,15 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/bf16mull.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/mul.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16mull.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(bfloat16, bf16mull, (long double x, long double y)) {
-  return fputil::generic::mul<bfloat16>(x, y);
+  return math::bf16mull(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16add.cpp b/libc/src/math/generic/f16add.cpp
index e9be8a743721e..fcd31e73183a1 100644
--- a/libc/src/math/generic/f16add.cpp
+++ b/libc/src/math/generic/f16add.cpp
@@ -7,14 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/f16add.h"
-#include "src/__support/FPUtil/generic/add_sub.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/f16add.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, f16add, (double x, double y)) {
-  return fputil::generic::add<float16>(x, y);
+  return math::f16add(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16addf.cpp b/libc/src/math/generic/f16addf.cpp
index ee05ff7f00531..63fe8c5f044bd 100644
--- a/libc/src/math/generic/f16addf.cpp
+++ b/libc/src/math/generic/f16addf.cpp
@@ -7,14 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/f16addf.h"
-#include "src/__support/FPUtil/generic/add_sub.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/f16addf.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, f16addf, (float x, float y)) {
-  return fputil::generic::add<float16>(x, y);
+  return math::f16addf(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16addf128.cpp b/libc/src/math/generic/f16addf128.cpp
index 4e9038e23125a..87e327e0c20d4 100644
--- a/libc/src/math/generic/f16addf128.cpp
+++ b/libc/src/math/generic/f16addf128.cpp
@@ -7,14 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/f16addf128.h"
-#include "src/__support/FPUtil/generic/add_sub.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/f16addf128.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, f16addf128, (float128 x, float128 y)) {
-  return fputil::generic::add<float16>(x, y);
+  return math::f16addf128(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16addl.cpp b/libc/src/math/generic/f16addl.cpp
index 925f08418b99d..4d93b7105aa79 100644
--- a/libc/src/math/generic/f16addl.cpp
+++ b/libc/src/math/generic/f16addl.cpp
@@ -7,14 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/f16addl.h"
-#include "src/__support/FPUtil/generic/add_sub.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/f16addl.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, f16addl, (long double x, long double y)) {
-  return fputil::generic::add<float16>(x, y);
+  return math::f16addl(x, y);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/logbl.cpp b/libc/src/math/generic/logbl.cpp
index dcab957f2c9c5..6c1df6d6549c0 100644
--- a/libc/src/math/generic/logbl.cpp
+++ b/libc/src/math/generic/logbl.cpp
@@ -7,14 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/logbl.h"
-#include "src/__support/FPUtil/ManipulationFunctions.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/logbl.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(long double, logbl, (long double x)) {
-  return fputil::logb(x);
+  return math::logbl(x);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/tanf16.cpp b/libc/src/math/generic/tanf16.cpp
index 880ba0101a96e..8126a06cbaba9 100644
--- a/libc/src/math/generic/tanf16.cpp
+++ b/libc/src/math/generic/tanf16.cpp
@@ -7,118 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/tanf16.h"
-#include "hdr/errno_macros.h"
-#include "hdr/fenv_macros.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/macros/optimization.h"
-#include "src/__support/math/sincosf16_utils.h"
+#include "src/__support/math/tanf16.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-constexpr size_t N_EXCEPTS = 9;
-
-constexpr fputil::ExceptValues<float16, N_EXCEPTS> TANF16_EXCEPTS{{
-    // (input, RZ output, RU offset, RD offset, RN offset)
-    {0x2894, 0x2894, 1, 0, 1},
-    {0x3091, 0x3099, 1, 0, 0},
-    {0x3098, 0x30a0, 1, 0, 0},
-    {0x55ed, 0x3911, 1, 0, 0},
-    {0x607b, 0xc638, 0, 1, 1},
-    {0x674e, 0x3b7d, 1, 0, 0},
-    {0x6807, 0x4014, 1, 0, 1},
-    {0x6f4d, 0xbe19, 0, 1, 1},
-    {0x7330, 0xcb62, 0, 1, 0},
-}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) {
-  using namespace math::sincosf16_internal;
-  using FPBits = fputil::FPBits<float16>;
-  FPBits xbits(x);
-
-  uint16_t x_u = xbits.uintval();
-  uint16_t x_abs = x_u & 0x7fff;
-  float xf = x;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  bool x_sign = x_u >> 15;
-  // Handle exceptional values
-  if (auto r = TANF16_EXCEPTS.lookup_odd(x_abs, x_sign);
-      LIBC_UNLIKELY(r.has_value()))
-    return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-  // |x| <= 0x1.d1p-5
-  if (LIBC_UNLIKELY(x_abs <= 0x2b44)) {
-    // |x| <= 0x1.398p-11
-    if (LIBC_UNLIKELY(x_abs <= 0x10e6)) {
-      // tan(+/-0) = +/-0
-      if (LIBC_UNLIKELY(x_abs == 0))
-        return x;
-
-      int rounding = fputil::quick_get_round();
-
-      // Exhaustive tests show that, when:
-      // x > 0, and rounding upward or
-      // x < 0, and rounding downward then,
-      // tan(x) = x * 2^-11 + x
-      if ((xbits.is_pos() && rounding == FE_UPWARD) ||
-          (xbits.is_neg() && rounding == FE_DOWNWARD))
-        return fputil::cast<float16>(fputil::multiply_add(xf, 0x1.0p-11f, xf));
-      return x;
-    }
-
-    float xsq = xf * xf;
-
-    // Degree-6 minimax odd polynomial of tan(x) generated by Sollya with:
-    // > P = fpminimax(tan(x)/x, [|0, 2, 4, 6|], [|1, SG...|], [0, pi/32]);
-    float result = fputil::polyeval(xsq, 0x1p0f, 0x1.555556p-2f, 0x1.110ee4p-3f,
-                                    0x1.be80f6p-5f);
-
-    return fputil::cast<float16>(xf * result);
-  }
-
-  // tan(+/-inf) = NaN, and tan(NaN) = NaN
-  if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
-    if (xbits.is_signaling_nan()) {
-      fputil::raise_except_if_required(FE_INVALID);
-      return FPBits::quiet_nan().get_val();
-    }
-    // x = +/-inf
-    if (x_abs == 0x7c00) {
-      fputil::set_errno_if_required(EDOM);
-      fputil::raise_except_if_required(FE_INVALID);
-    }
-
-    return x + FPBits::quiet_nan().get_val();
-  }
-
-  // Range reduction:
-  // For |x| > pi/32, we perform range reduction as follows:
-  // Find k and y such that:
-  //   x = (k + y) * pi/32;
-  //   k is an integer, |y| < 0.5
-  //
-  // This is done by performing:
-  //   k = round(x * 32/pi)
-  //   y = x * 32/pi - k
-  //
-  // Once k and y are computed, we then deduce the answer by the formula:
-  // tan(x) = sin(x) / cos(x)
-  // 	    = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
-  float sin_k, cos_k, sin_y, cosm1_y;
-  sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y);
-
-  // Note that, cosm1_y = cos_y - 1:
-  using fputil::multiply_add;
-  return fputil::cast<float16>(
-      multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
-      multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
-}
+LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) { return math::tanf16(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/tanpif.cpp b/libc/src/math/generic/tanpif.cpp
index 44df22b517a46..e3568d6e9d35c 100644
--- a/libc/src/math/generic/tanpif.cpp
+++ b/libc/src/math/generic/tanpif.cpp
@@ -7,101 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/tanpif.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/__support/math/sincosf_utils.h"
+#include "src/__support/math/tanpif.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-constexpr size_t N_EXCEPTS = 3;
-
-constexpr fputil::ExceptValues<float, N_EXCEPTS> TANPIF_EXCEPTS{{
-    // (input, RZ output, RU offset, RD offset, RN offset)
-    {0x38F26685, 0x39BE6182, 1, 0, 0},
-    {0x3E933802, 0x3FA267DD, 1, 0, 0},
-    {0x3F3663FF, 0xBFA267DD, 0, 1, 0},
-}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float, tanpif, (float x)) {
-  using namespace math::sincosf_utils_internal;
-  using FPBits = typename fputil::FPBits<float>;
-  FPBits xbits(x);
-
-  uint32_t x_u = xbits.uintval();
-  uint32_t x_abs = x_u & 0x7fff'ffffU;
-  double xd = static_cast<double>(xbits.get_val());
-
-  // Handle exceptional values
-  if (LIBC_UNLIKELY(x_abs <= 0x3F3663FF)) {
-    if (LIBC_UNLIKELY(x_abs == 0U))
-      return x;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-    bool x_sign = x_u >> 31;
-
-    if (auto r = TANPIF_EXCEPTS.lookup_odd(x_abs, x_sign);
-        LIBC_UNLIKELY(r.has_value()))
-      return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  }
-
-  // Numbers greater or equal to 2^23 are always integers, or infinity, or NaN
-  if (LIBC_UNLIKELY(x_abs >= 0x4B00'0000)) {
-    // x is inf or NaN.
-    if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) {
-      if (xbits.is_signaling_nan()) {
-        fputil::raise_except_if_required(FE_INVALID);
-        return FPBits::quiet_nan().get_val();
-      }
-
-      if (x_abs == 0x7f80'0000U) {
-        fputil::set_errno_if_required(EDOM);
-        fputil::raise_except_if_required(FE_INVALID);
-      }
-
-      return x + FPBits::quiet_nan().get_val();
-    }
-
-    return FPBits::zero(xbits.sign()).get_val();
-  }
-
-  // Range reduction:
-  // For |x| > 1/32, we perform range reduction as follows:
-  // Find k and y such that:
-  //   x = (k + y) * 1/32
-  //   k is an integer
-  //   |y| < 0.5
-  //
-  // This is done by performing:
-  //   k = round(x * 32)
-  //   y = x * 32 - k
-  //
-  // Once k and y are computed, we then deduce the answer by the formula:
-  // tan(x) = sin(x) / cos(x)
-  //        = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
-  double sin_k, cos_k, sin_y, cosm1_y;
-  sincospif_eval(xd, sin_k, cos_k, sin_y, cosm1_y);
-
-  if (LIBC_UNLIKELY(sin_y == 0 && cos_k == 0)) {
-    fputil::set_errno_if_required(EDOM);
-    fputil::raise_except_if_required(FE_DIVBYZERO);
-
-    int32_t x_mp5_i = static_cast<int32_t>(xd - 0.5);
-    return FPBits::inf((x_mp5_i & 0x1) ? Sign::NEG : Sign::POS).get_val();
-  }
-
-  using fputil::multiply_add;
-  return fputil::cast<float>(
-      multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
-      multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
-}
+LLVM_LIBC_FUNCTION(float, tanpif, (float x)) { return math::tanpif(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index dfe2378269921..91942a12016ec 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -30,7 +30,13 @@ add_fp_unittest(
     libc.src.__support.math.bf16addf
     libc.src.__support.math.bf16addf128
     libc.src.__support.math.bf16divf
+    libc.src.__support.math.bf16divl
     libc.src.__support.math.bf16fmaf
+    libc.src.__support.math.bf16fmal
+    libc.src.__support.math.bf16mul
+    libc.src.__support.math.bf16mulf
+    libc.src.__support.math.bf16mulf128
+    libc.src.__support.math.bf16mull
     libc.src.__support.math.canonicalize
     libc.src.__support.math.canonicalizebf16
     libc.src.__support.math.canonicalizef
@@ -66,6 +72,10 @@ add_fp_unittest(
     libc.src.__support.math.exp10f16
     libc.src.__support.math.expf
     libc.src.__support.math.expf16
+    libc.src.__support.math.f16add
+    libc.src.__support.math.f16addf
+    libc.src.__support.math.f16addf128
+    libc.src.__support.math.f16addl
     libc.src.__support.math.f16fma
     libc.src.__support.math.f16fmaf
     libc.src.__support.math.f16fmaf128
@@ -110,6 +120,7 @@ add_fp_unittest(
     libc.src.__support.math.llogbf
     libc.src.__support.math.llogbf128
     libc.src.__support.math.llogbf16
+    libc.src.__support.math.logbl
     libc.src.__support.math.logf16    
     libc.src.__support.math.llogbl
     libc.src.__support.math.pow
@@ -131,6 +142,8 @@ add_fp_unittest(
     libc.src.__support.math.sqrtf
     libc.src.__support.math.tan
     libc.src.__support.math.tanf
+    libc.src.__support.math.tanf16
     libc.src.__support.math.tanhf
     libc.src.__support.math.tanhf16
+    libc.src.__support.math.tanpif
 )
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 8eba836538c41..d290fadb52d11 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -47,8 +47,14 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) {
   EXPECT_FP_EQ(10.0f16, LIBC_NAMESPACE::shared::f16fmaf128(
                             float128(2.0), float128(3.0), float128(4.0)));
 
+  EXPECT_FP_EQ(
+      5.0f16, LIBC_NAMESPACE::shared::f16addf128(float128(2.0), float128(3.0)));
+
 #endif
 
+  EXPECT_FP_EQ(5.0f16, LIBC_NAMESPACE::shared::f16add(2.0, 3.0));
+  EXPECT_FP_EQ(5.0f16, LIBC_NAMESPACE::shared::f16addf(2.0f, 3.0f));
+  EXPECT_FP_EQ(5.0f16, LIBC_NAMESPACE::shared::f16addl(2.0L, 3.0L));
   EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::f16sqrt(0.0));
 
   EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::f16sqrtf(0.0f));
@@ -73,6 +79,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) {
   EXPECT_FP_EQ(0x1.921fb6p+0f16, LIBC_NAMESPACE::shared::acosf16(0.0f16));
   EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::f16sqrtl(1.0L));
   EXPECT_FP_EQ(0.0f16, LIBC_NAMESPACE::shared::sinf16(0.0f16));
+  EXPECT_FP_EQ(0.0f16, LIBC_NAMESPACE::shared::tanf16(0.0f16));
   EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::sinpif16(0.0f16));
   EXPECT_FP_EQ(0.0f16, LIBC_NAMESPACE::shared::tanhf16(0.0f16));
 
@@ -136,12 +143,15 @@ TEST(LlvmLibcSharedMathTest, AllFloat) {
   EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::shared::sqrtf(0.0f));
   EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::shared::tanf(0.0f));
   EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::shared::tanhf(0.0f));
+  EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::shared::tanpif(0.0f));
 
   float canonicalizef_cx = 0.0f;
   float canonicalizef_x = 0.0f;
   EXPECT_EQ(0, LIBC_NAMESPACE::shared::canonicalizef(&canonicalizef_cx,
                                                      &canonicalizef_x));
   EXPECT_FP_EQ(0x0p+0f, canonicalizef_cx);
+
+  EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::bf16mulf(0.0f, 0.0f));
 }
 
 TEST(LlvmLibcSharedMathTest, AllDouble) {
@@ -187,6 +197,7 @@ TEST(LlvmLibcSharedMathTest, AllLongDouble) {
   EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::fsqrtl(0.0L));
   EXPECT_EQ(0, LIBC_NAMESPACE::shared::ilogbl(0x1.p+0L));
   EXPECT_EQ(0L, LIBC_NAMESPACE::shared::llogbl(1.0L));
+  EXPECT_FP_EQ(0x0p+0L, LIBC_NAMESPACE::shared::logbl(1.0L));
   EXPECT_FP_EQ(10.0f, LIBC_NAMESPACE::shared::ffmal(2.0L, 3.0, 4.0L));
 
   long double canonicalizel_cx = 0.0L;
@@ -194,6 +205,10 @@ TEST(LlvmLibcSharedMathTest, AllLongDouble) {
   EXPECT_EQ(0, LIBC_NAMESPACE::shared::canonicalizel(&canonicalizel_cx,
                                                      &canonicalizel_x));
   EXPECT_FP_EQ(0x0p+0L, canonicalizel_cx);
+
+  EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::bf16mul(0.0L, 0.0L));
+
+  EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::bf16mull(0.0L, 0.0L));
 }
 
 #ifdef LIBC_TYPES_HAS_FLOAT128
@@ -229,6 +244,9 @@ TEST(LlvmLibcSharedMathTest, AllFloat128) {
   EXPECT_EQ(0, LIBC_NAMESPACE::shared::canonicalizef128(&canonicalizef128_cx,
                                                         &canonicalizef128_x));
   EXPECT_FP_EQ(float128(0.0), canonicalizef128_cx);
+
+  EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::bf16mulf128(
+                                  float128(0.0), float128(0.0)));
 }
 
 #endif // LIBC_TYPES_HAS_FLOAT128
@@ -236,6 +254,9 @@ TEST(LlvmLibcSharedMathTest, AllFloat128) {
 TEST(LlvmLibcSharedMathTest, AllBFloat16) {
   EXPECT_FP_EQ(bfloat16(5.0), LIBC_NAMESPACE::shared::bf16add(2.0, 3.0));
   EXPECT_FP_EQ(bfloat16(2.0f), LIBC_NAMESPACE::shared::bf16divf(4.0f, 2.0f));
+  EXPECT_FP_EQ(bfloat16(2.0), LIBC_NAMESPACE::shared::bf16divl(6.0L, 3.0L));
+  EXPECT_FP_EQ(bfloat16(10.0),
+               LIBC_NAMESPACE::shared::bf16fmal(2.0L, 3.0L, 4.0L));
 
   bfloat16 canonicalizebf16_cx = bfloat16(0.0);
   bfloat16 canonicalizebf16_x = bfloat16(0.0);
diff --git a/libclc/opencl/lib/amdgcn/workitem/get_local_size.cl b/libclc/opencl/lib/amdgcn/workitem/get_local_size.cl
index 8aa24201de573..34e4f2f1b4c19 100644
--- a/libclc/opencl/lib/amdgcn/workitem/get_local_size.cl
+++ b/libclc/opencl/lib/amdgcn/workitem/get_local_size.cl
@@ -8,18 +8,14 @@
 
 #include <clc/opencl/opencl-base.h>
 
-uint __clc_amdgcn_get_local_size_x(void) __asm("llvm.r600.read.local.size.x");
-uint __clc_amdgcn_get_local_size_y(void) __asm("llvm.r600.read.local.size.y");
-uint __clc_amdgcn_get_local_size_z(void) __asm("llvm.r600.read.local.size.z");
-
 _CLC_DEF _CLC_OVERLOAD size_t get_local_size(uint dim) {
   switch (dim) {
   case 0:
-    return __clc_amdgcn_get_local_size_x();
+    return __builtin_amdgcn_workgroup_size_x();
   case 1:
-    return __clc_amdgcn_get_local_size_y();
+    return __builtin_amdgcn_workgroup_size_y();
   case 2:
-    return __clc_amdgcn_get_local_size_z();
+    return __builtin_amdgcn_workgroup_size_z();
   default:
     return 1;
   }
diff --git a/libclc/opencl/lib/amdgcn/workitem/get_num_groups.cl b/libclc/opencl/lib/amdgcn/workitem/get_num_groups.cl
index 11c1ba373aeff..9e8dddb859064 100644
--- a/libclc/opencl/lib/amdgcn/workitem/get_num_groups.cl
+++ b/libclc/opencl/lib/amdgcn/workitem/get_num_groups.cl
@@ -8,18 +8,14 @@
 
 #include <clc/opencl/opencl-base.h>
 
-uint __clc_amdgcn_get_num_groups_x(void) __asm("llvm.r600.read.ngroups.x");
-uint __clc_amdgcn_get_num_groups_y(void) __asm("llvm.r600.read.ngroups.y");
-uint __clc_amdgcn_get_num_groups_z(void) __asm("llvm.r600.read.ngroups.z");
-
 _CLC_DEF _CLC_OVERLOAD size_t get_num_groups(uint dim) {
   switch (dim) {
   case 0:
-    return __clc_amdgcn_get_num_groups_x();
+    return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
   case 1:
-    return __clc_amdgcn_get_num_groups_y();
+    return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
   case 2:
-    return __clc_amdgcn_get_num_groups_z();
+    return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
   default:
     return 1;
   }
diff --git a/libsycl/src/detail/offload/offload_topology.cpp b/libsycl/src/detail/offload/offload_topology.cpp
index 5e595e520a452..ab4c57ecf37eb 100644
--- a/libsycl/src/detail/offload/offload_topology.cpp
+++ b/libsycl/src/detail/offload/offload_topology.cpp
@@ -56,7 +56,7 @@ void OffloadTopology::registerNewPlatformsAndDevices(
 }
 
 void discoverOffloadDevices() {
-  callAndThrow(olInit);
+  callAndThrow(olInit, nullptr);
 
   // liboffload returns devices sorted by backend + platform. We rely on this
   // behavior during device enumeration.
diff --git a/lldb/include/lldb/Host/ProcessLaunchInfo.h b/lldb/include/lldb/Host/ProcessLaunchInfo.h
index d89fe68b2d0d4..e13eecc9463ea 100644
--- a/lldb/include/lldb/Host/ProcessLaunchInfo.h
+++ b/lldb/include/lldb/Host/ProcessLaunchInfo.h
@@ -137,8 +137,7 @@ class ProcessLaunchInfo : public ProcessInfo {
   bool ShouldUsePTY() const {
 #ifdef _WIN32
     return GetPTY().GetPseudoTerminalHandle() != ((HANDLE)(long long)-1) &&
-           GetNumFileActions() == 0 &&
-           GetFlags().Test(lldb::eLaunchFlagLaunchInTTY);
+           GetNumFileActions() == 0;
 #else
     return true;
 #endif
diff --git a/lldb/include/lldb/Host/posix/HostThreadPosix.h b/lldb/include/lldb/Host/posix/HostThreadPosix.h
index 6c8e09fc11030..32be7154fa1d8 100644
--- a/lldb/include/lldb/Host/posix/HostThreadPosix.h
+++ b/lldb/include/lldb/Host/posix/HostThreadPosix.h
@@ -25,7 +25,7 @@ class HostThreadPosix : public HostNativeThreadBase {
   Status Join(lldb::thread_result_t *result) override;
   Status Cancel() override;
 
-  Status Detach();
+  void Reset() override;
 };
 
 } // namespace lldb_private
diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h
index 361108fd8f0e7..438a5e1faf86d 100644
--- a/lldb/include/lldb/Utility/ArchSpec.h
+++ b/lldb/include/lldb/Utility/ArchSpec.h
@@ -14,6 +14,7 @@
 #include "lldb/lldb-forward.h"
 #include "lldb/lldb-private-enumerations.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstddef>
 #include <cstdint>
@@ -542,6 +543,14 @@ class ArchSpec {
 
   void SetFlags(const std::string &elf_abi);
 
+  const llvm::SubtargetFeatures &GetSubtargetFeatures() const {
+    return m_subtarget_features;
+  }
+
+  void SetSubtargetFeatures(llvm::SubtargetFeatures &&subtarget_features) {
+    m_subtarget_features = std::move(subtarget_features);
+  }
+
 protected:
   void UpdateCore();
 
@@ -553,6 +562,8 @@ class ArchSpec {
   // these are application specific extensions like micromips, mips16 etc.
   uint32_t m_flags = 0;
 
+  llvm::SubtargetFeatures m_subtarget_features;
+
   // Called when m_def or m_entry are changed.  Fills in all remaining members
   // with default values.
   void CoreUpdated(bool update_triple);
diff --git a/lldb/packages/Python/lldbsuite/test/lldbinline.py b/lldb/packages/Python/lldbsuite/test/lldbinline.py
index ae38ab9d8c9d7..d1225db4d61a9 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbinline.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbinline.py
@@ -210,4 +210,5 @@ def MakeInlineTest(__file, __globals, decorators=None, name=None, build_dict=Non
     # correctly in test results.
     test_class.test_filename = __file
     test_class.mydir = TestBase.compute_mydir(__file)
+    test_class.SHARED_BUILD_TESTCASE = False
     return test_class
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 6034eca3b93f2..65fd56ed76c1c 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -569,6 +569,11 @@ class Base(unittest.TestCase):
     # Can be overridden by the LLDB_TIME_WAIT_NEXT_LAUNCH environment variable.
     timeWaitNextLaunch = 1.0
 
+    # Some test case classes require a separate build directory for each test
+    # function. Subclasses can set this to False in those cases. This slows down
+    # the test, but provides isolation where needed.
+    SHARED_BUILD_TESTCASE = True
+
     @staticmethod
     def compute_mydir(test_file):
         """Subclasses should call this function to correctly calculate the
@@ -754,7 +759,10 @@ def getSourceDir(self):
         return os.path.join(configuration.test_src_root, self.mydir)
 
     def getBuildDirBasename(self):
-        return self.__class__.__module__ + "." + self.testMethodName
+        if self.SHARED_BUILD_TESTCASE:
+            return self.__class__.__module__
+        else:
+            return self.__class__.__module__ + "." + self.testMethodName
 
     def getBuildDir(self):
         """Return the full path to the current test."""
@@ -763,10 +771,10 @@ def getBuildDir(self):
         )
 
     def makeBuildDir(self):
-        """Create the test-specific working directory, deleting any previous
-        contents."""
+        """Create the test-specific working directory, optionally deleting any
+        previous contents."""
         bdir = self.getBuildDir()
-        if os.path.isdir(bdir):
+        if os.path.isdir(bdir) and not self.SHARED_BUILD_TESTCASE:
             shutil.rmtree(bdir)
         lldbutil.mkdir_p(bdir)
 
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 8e342b5277fc4..e8fd8e8e37e65 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -245,10 +245,7 @@ def __init__(
         self.terminated: bool = False
         self.events: List[Event] = []
         self.progress_events: List[Event] = []
-        self.invalidated_event: Optional[Event] = None
-        self.memory_event: Optional[Event] = None
         self.reverse_requests: List[Request] = []
-        self.module_events: List[Dict] = []
         self.sequence: int = 1
         self.output: Dict[str, str] = {}
         self.reverse_process: Optional[subprocess.Popen] = None
@@ -513,10 +510,6 @@ def _handle_event(self, packet: Event) -> None:
         elif event == "capabilities" and body:
             # Update the capabilities with new ones from the event.
             self.capabilities.update(body["capabilities"])
-        elif event == "invalidated":
-            self.invalidated_event = packet
-        elif event == "memory":
-            self.memory_event = packet
 
     def _handle_reverse_request(self, request: Request) -> None:
         if request in self.reverse_requests:
@@ -704,6 +697,18 @@ def wait_for_terminated(self):
             raise ValueError("didn't get terminated event")
         return event_dict
 
+    def wait_for_invalidated(self):
+        event_dict = self.wait_for_event(["invalidated"])
+        if event_dict is None:
+            raise ValueError("didn't get invalidated event")
+        return event_dict
+
+    def wait_for_memory(self):
+        event_dict = self.wait_for_event(["memory"])
+        if event_dict is None:
+            raise ValueError("didn't get memory event")
+        return event_dict
+
     def get_capability(self, key: str):
         """Get a value for the given key if it there is a key/value pair in
         the capabilities reported by the adapter.
@@ -1581,7 +1586,7 @@ def request_threads(self):
         return response
 
     def request_variables(
-        self, variablesReference, start=None, count=None, is_hex=None
+        self, variablesReference, start=None, count=None, is_hex: Optional[bool] = None
     ):
         args_dict = {"variablesReference": variablesReference}
         if start is not None:
@@ -1597,7 +1602,7 @@ def request_variables(
         }
         return self._send_recv(command_dict)
 
-    def request_setVariable(self, containingVarRef, name, value, id=None):
+    def request_setVariable(self, containingVarRef, name, value, id=None, is_hex=None):
         args_dict = {
             "variablesReference": containingVarRef,
             "name": name,
@@ -1605,6 +1610,8 @@ def request_setVariable(self, containingVarRef, name, value, id=None):
         }
         if id is not None:
             args_dict["id"] = id
+        if is_hex is not None:
+            args_dict["format"] = {"hex": is_hex}
         command_dict = {
             "command": "setVariable",
             "type": "request",
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index e49e4a28e3878..14a5698653588 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -265,21 +265,6 @@ def verify_commands(self, flavor: str, output: str, commands: List[str]):
                 f"Command '{flavor}' - '{cmd}' not found in output: {output}",
             )
 
-    def verify_invalidated_event(self, expected_areas):
-        event = self.dap_server.invalidated_event
-        self.dap_server.invalidated_event = None
-        self.assertIsNotNone(event)
-        areas = event["body"].get("areas", [])
-        self.assertEqual(set(expected_areas), set(areas))
-
-    def verify_memory_event(self, memoryReference):
-        if memoryReference is None:
-            self.assertIsNone(self.dap_server.memory_event)
-        event = self.dap_server.memory_event
-        self.dap_server.memory_event = None
-        self.assertIsNotNone(event)
-        self.assertEqual(memoryReference, event["body"].get("memoryReference"))
-
     def get_dict_value(self, d: Mapping[str, Any], key_path: List[str]) -> Any:
         """Verify each key in the key_path array is in contained in each
         dictionary within "d". Assert if any key isn't in the
@@ -375,21 +360,28 @@ def get_local_as_int(self, name, threadId=None):
         else:
             return int(value)
 
-    def set_variable(self, varRef, name, value, id=None):
+    def set_variable(self, varRef, name, value, id=None, is_hex: Optional[bool] = None):
         """Set a variable."""
-        response = self.dap_server.request_setVariable(varRef, name, str(value), id=id)
+        response = self.dap_server.request_setVariable(
+            varRef, name, str(value), id=id, is_hex=is_hex
+        )
         if response["success"]:
-            self.verify_invalidated_event(["variables"])
-            self.verify_memory_event(response["body"].get("memoryReference"))
+            invalidated_event = self.dap_server.wait_for_invalidated()
+            self.assertEqual(invalidated_event["body"].get("areas"), ["variables"])
+            memory_event = self.dap_server.wait_for_memory()
+            self.assertEqual(
+                memory_event["body"].get("memoryReference"),
+                response["body"].get("memoryReference"),
+            )
         return response
 
-    def set_local(self, name, value, id=None):
+    def set_local(self, name, value, id=None, is_hex: Optional[bool] = None):
         """Set a top level local variable only."""
         # Get the locals scope reference dynamically
         locals_ref = self.get_locals_scope_reference()
         if locals_ref is None:
             return None
-        return self.set_variable(locals_ref, name, str(value), id=id)
+        return self.set_variable(locals_ref, name, str(value), id=id, is_hex=is_hex)
 
     def get_locals_scope_reference(self):
         """Get the variablesReference for the locals scope."""
@@ -623,5 +615,6 @@ def writeMemory(self, memoryReference, data=None, offset=0, allowPartial=False):
             memoryReference, encodedData, offset=offset, allowPartial=allowPartial
         )
         if response["success"]:
-            self.verify_invalidated_event(["all"])
+            invalidated_event = self.dap_server.wait_for_invalidated()
+            self.assertEqual(invalidated_event["body"].get("areas"), ["all"])
         return response
diff --git a/lldb/source/Host/posix/HostThreadPosix.cpp b/lldb/source/Host/posix/HostThreadPosix.cpp
index a53a8cc9d8389..92f172ecd00a5 100644
--- a/lldb/source/Host/posix/HostThreadPosix.cpp
+++ b/lldb/source/Host/posix/HostThreadPosix.cpp
@@ -50,12 +50,8 @@ Status HostThreadPosix::Cancel() {
   return error;
 }
 
-Status HostThreadPosix::Detach() {
-  Status error;
-  if (IsJoinable()) {
-    int err = ::pthread_detach(m_thread);
-    error = Status(err, eErrorTypePOSIX);
-  }
-  Reset();
-  return error;
+void HostThreadPosix::Reset() {
+  if (IsJoinable())
+    ::pthread_detach(m_thread);
+  HostNativeThreadBase::Reset();
 }
diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
index 8e495e20d254a..6384b5e1bb57c 100644
--- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
+++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
@@ -1593,23 +1593,28 @@ DisassemblerLLVMC::DisassemblerLLVMC(const ArchSpec &arch,
   }
 
   if (triple.isRISCV() && !cpu_or_features_overriden) {
-    uint32_t arch_flags = arch.GetFlags();
-    if (arch_flags & ArchSpec::eRISCV_rvc)
-      features_str += "+c,";
-    if (arch_flags & ArchSpec::eRISCV_rve)
-      features_str += "+e,";
-    if ((arch_flags & ArchSpec::eRISCV_float_abi_single) ==
-        ArchSpec::eRISCV_float_abi_single)
-      features_str += "+f,";
-    if ((arch_flags & ArchSpec::eRISCV_float_abi_double) ==
-        ArchSpec::eRISCV_float_abi_double)
-      features_str += "+f,+d,";
-    if ((arch_flags & ArchSpec::eRISCV_float_abi_quad) ==
-        ArchSpec::eRISCV_float_abi_quad)
-      features_str += "+f,+d,+q,";
-    // FIXME: how do we detect features such as `+a`, `+m`?
-    // Turn them on by default now, since everyone seems to use them
-    features_str += "+a,+m,";
+    auto subtarget_features = arch.GetSubtargetFeatures().getString();
+    if (!subtarget_features.empty()) {
+      features_str += subtarget_features;
+    } else {
+      uint32_t arch_flags = arch.GetFlags();
+      if (arch_flags & ArchSpec::eRISCV_rvc)
+        features_str += "+c,";
+      if (arch_flags & ArchSpec::eRISCV_rve)
+        features_str += "+e,";
+      if ((arch_flags & ArchSpec::eRISCV_float_abi_single) ==
+          ArchSpec::eRISCV_float_abi_single)
+        features_str += "+f,";
+      if ((arch_flags & ArchSpec::eRISCV_float_abi_double) ==
+          ArchSpec::eRISCV_float_abi_double)
+        features_str += "+f,+d,";
+      if ((arch_flags & ArchSpec::eRISCV_float_abi_quad) ==
+          ArchSpec::eRISCV_float_abi_quad)
+        features_str += "+f,+d,+q,";
+      // FIXME: how do we detect features such as `+a`, `+m`?
+      // Turn them on by default now, since everyone seems to use them
+      features_str += "+a,+m,";
+    }
   }
 
   // We use m_disasm_up.get() to tell whether we are valid or not, so if this
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 1a515852e7092..830ff4c1091fa 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -13,6 +13,7 @@
 #include <optional>
 #include <unordered_map>
 
+#include "lldb/Core/Debugger.h"
 #include "lldb/Core/Module.h"
 #include "lldb/Core/ModuleSpec.h"
 #include "lldb/Core/PluginManager.h"
@@ -27,12 +28,14 @@
 #include "lldb/Target/Target.h"
 #include "lldb/Utility/ArchSpec.h"
 #include "lldb/Utility/DataBufferHeap.h"
+#include "lldb/Utility/DataExtractor.h"
 #include "lldb/Utility/FileSpecList.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/RangeMap.h"
 #include "lldb/Utility/Status.h"
 #include "lldb/Utility/Stream.h"
+#include "lldb/Utility/StreamString.h"
 #include "lldb/Utility/Timer.h"
 #include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/PointerUnion.h"
@@ -45,6 +48,9 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/MipsABIFlags.h"
+#include "llvm/Support/RISCVAttributes.h"
+#include "llvm/TargetParser/RISCVISAInfo.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
 
 #define CASE_AND_STREAM(s, def, width)                                         \
   case def:                                                                    \
@@ -1407,6 +1413,178 @@ void ObjectFileELF::ParseARMAttributes(DataExtractor &data, uint64_t length,
   }
 }
 
+static std::optional<lldb::offset_t>
+FindSubSectionOffsetByName(const DataExtractor &data, lldb::offset_t offset,
+                           uint32_t length, llvm::StringRef name) {
+  uint32_t section_length = 0;
+  llvm::StringRef section_name;
+  do {
+    offset += section_length;
+    // A sub-section's size and name are included in its total length, so read
+    // them via a temporary offset: `offset` keeps pointing at the beginning
+    // of the sub-section and can be used as the return value.
+    auto tmp_offset = offset;
+    section_length = data.GetU32(&tmp_offset);
+    section_name = data.GetCStr(&tmp_offset);
+  } while (section_name != name && offset + section_length < length);
+
+  if (section_name == name)
+    return offset;
+
+  return std::nullopt;
+}
+
+static std::optional<lldb::offset_t>
+FindSubSubSectionOffsetByTag(const DataExtractor &data, lldb::offset_t offset,
+                             unsigned tag) {
+  // Consume the sub-section size and name to advance the offset to the
+  // beginning of the sub-sub-section list.
+  auto parent_section_length = data.GetU32(&offset);
+  data.GetCStr(&offset);
+  auto parent_section_end_offset = offset + parent_section_length;
+
+  uint32_t section_length = 0;
+  unsigned section_tag = 0;
+  do {
+    offset += section_length;
+    // As with a sub-section, a sub-sub-section's tag and size are included in
+    // the total sub-sub-section length.
+    auto tmp_offset = offset;
+    section_tag = data.GetULEB128(&tmp_offset);
+    section_length = data.GetU32(&tmp_offset);
+  } while (section_tag != tag &&
+           offset + section_length < parent_section_end_offset);
+
+  if (section_tag == tag)
+    return offset;
+
+  return std::nullopt;
+}
+
+static std::optional<std::variant<uint64_t, llvm::StringRef>>
+GetAttributeValueByTag(const DataExtractor &data, lldb::offset_t offset,
+                       unsigned tag) {
+  // Consume the sub-sub-section tag and size to advance the offset to the
+  // beginning of the attribute list.
+  data.GetULEB128(&offset);
+  auto parent_section_length = data.GetU32(&offset);
+  auto parent_section_end_offset = offset + parent_section_length;
+
+  std::variant<uint64_t, llvm::StringRef> result;
+  unsigned attribute_tag = 0;
+  do {
+    attribute_tag = data.GetULEB128(&offset);
+    // From the riscv psABI document:
+    // RISC-V attributes have a string value if the tag number is odd and an
+    // integer value if the tag number is even.
+    if (attribute_tag % 2)
+      result = data.GetCStr(&offset);
+    else
+      result = data.GetULEB128(&offset);
+  } while (attribute_tag != tag && offset < parent_section_end_offset);
+
+  if (attribute_tag == tag)
+    return result;
+
+  return std::nullopt;
+}
+
+void ObjectFileELF::ParseRISCVAttributes(DataExtractor &data, uint64_t length,
+                                         ArchSpec &arch_spec) {
+  Log *log = GetLog(LLDBLog::Modules);
+
+  lldb::offset_t offset = 0;
+
+  // According to the riscv psABI, the .riscv.attributes section has the
+  // following hierarchical structure:
+  //
+  // Section:
+  //   .riscv.attributes {
+  //       - (uint8_t) format
+  //       - Sub-Section 1 {
+  //           * (uint32_t) length
+  //           * (c_str) name
+  //           * Sub-Sub-Section 1.1 {
+  //               > (uleb128_t) tag
+  //               > (uint32_t) length
+  //               > (uleb128_t) attribute_tag_1.1.1
+  //                   $ (c_str or uleb128_t) value
+  //               > (uleb128_t) attribute_tag_1.1.2
+  //                   $ (c_str or uleb128_t) value
+  //               ...
+  //               Other attributes...
+  //               ...
+  //               > (uleb128_t) attribute_tag_1.1.N
+  //                   $ (c_str or uleb128_t) value
+  //           }
+  //           * Sub-Sub-Section 1.2 {
+  //               ...
+  //               Sub-Sub-Section structure...
+  //               ...
+  //           }
+  //           ...
+  //           Other sub-sub-sections...
+  //           ...
+  //       }
+  //       - Sub-Section 2 {
+  //           ...
+  //           Sub-Section structure...
+  //           ...
+  //       }
+  //       ...
+  //       Other sub-sections...
+  //       ...
+  //   }
+
+  uint8_t format_version = data.GetU8(&offset);
+  if (format_version != llvm::ELFAttrs::Format_Version)
+    return;
+
+  auto subsection_or_opt =
+      FindSubSectionOffsetByName(data, offset, length, "riscv");
+  if (!subsection_or_opt) {
+    LLDB_LOGF(log,
+              "ObjectFileELF::%s Ill-formed .riscv.attributes section: "
+              "mandatory 'riscv' sub-section was not preserved",
+              __FUNCTION__);
+    return;
+  }
+
+  auto subsubsection_or_opt = FindSubSubSectionOffsetByTag(
+      data, *subsection_or_opt, llvm::ELFAttrs::File);
+  if (!subsubsection_or_opt)
+    return;
+
+  auto value_or_opt = GetAttributeValueByTag(data, *subsubsection_or_opt,
+                                             llvm::RISCVAttrs::ARCH);
+  if (!value_or_opt)
+    return;
+
+  auto normalized_isa_info = llvm::RISCVISAInfo::parseNormalizedArchString(
+      std::get<llvm::StringRef>(*value_or_opt));
+  if (llvm::errorToBool(normalized_isa_info.takeError()))
+    return;
+
+  llvm::SubtargetFeatures features;
+  features.addFeaturesVector((*normalized_isa_info)->toFeatures());
+  arch_spec.SetSubtargetFeatures(std::move(features));
+
+  // Additional verification of the arch string. This is primarily needed to
+  // warn users if the executable file contains conflicting RISC-V extensions
+  // that could lead to invalid disassembler output.
+  auto isa_info = llvm::RISCVISAInfo::parseArchString(
+      std::get<llvm::StringRef>(*value_or_opt),
+      /* EnableExperimentalExtension=*/true);
+  if (auto error = isa_info.takeError()) {
+    StreamString ss;
+    ss << "The .riscv.attributes section contains an invalid RISC-V arch "
+          "string: "
+       << llvm::toString(std::move(error))
+       << "\n\tThis could result in misleading disassembler output.\n";
+    Debugger::ReportWarning(ss.GetString().str());
+  }
+}
+
 // GetSectionHeaderInfo
 size_t ObjectFileELF::GetSectionHeaderInfo(SectionHeaderColl &section_headers,
                                            DataExtractor &object_data,
@@ -1624,6 +1802,15 @@ size_t ObjectFileELF::GetSectionHeaderInfo(SectionHeaderColl &section_headers,
             ParseARMAttributes(data, section_size, arch_spec);
         }
 
+        if (arch_spec.GetTriple().isRISCV()) {
+          DataExtractor data;
+          if (sheader.sh_type == llvm::ELF::SHT_RISCV_ATTRIBUTES &&
+              section_size != 0 &&
+              data.SetData(object_data, sheader.sh_offset, section_size) ==
+                  section_size)
+            ParseRISCVAttributes(data, section_size, arch_spec);
+        }
+
         if (name == g_sect_name_gnu_debuglink) {
           DataExtractor data;
           if (section_size && (data.SetData(object_data, sheader.sh_offset,
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
index 9fc19bcd07f34..866ef270fa731 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
@@ -275,6 +275,10 @@ class ObjectFileELF : public lldb_private::ObjectFile {
                                  uint64_t length,
                                  lldb_private::ArchSpec &arch_spec);
 
+  static void ParseRISCVAttributes(lldb_private::DataExtractor &data,
+                                   uint64_t length,
+                                   lldb_private::ArchSpec &arch_spec);
+
   /// Parses the elf section headers and returns the uuid, debug link name,
   /// crc, archspec.
   static size_t GetSectionHeaderInfo(SectionHeaderColl &section_headers,
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
index 3e085e993cad7..5e1aa33a59f89 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
@@ -199,125 +199,137 @@ PlatformDarwin::PutFile(const lldb_private::FileSpec &source,
   return PlatformPOSIX::PutFile(source, destination, uid, gid);
 }
 
-FileSpecList PlatformDarwin::LocateExecutableScriptingResources(
-    Target *target, Module &module, Stream &feedback_stream) {
+static FileSpecList LocateExecutableScriptingResourcesFromDSYM(
+    Stream &feedback_stream, FileSpec module_spec, const Target &target,
+    const FileSpec &symfile_spec) {
   FileSpecList file_list;
-  if (target &&
-      target->GetDebugger().GetScriptLanguage() == eScriptLanguagePython) {
-    // NB some extensions might be meaningful and should not be stripped -
-    // "this.binary.file"
-    // should not lose ".file" but GetFileNameStrippingExtension() will do
-    // precisely that. Ideally, we should have a per-platform list of
-    // extensions (".exe", ".app", ".dSYM", ".framework") which should be
-    // stripped while leaving "this.binary.file" as-is.
-
-    FileSpec module_spec = module.GetFileSpec();
-
-    if (module_spec) {
-      if (SymbolFile *symfile = module.GetSymbolFile()) {
-        ObjectFile *objfile = symfile->GetObjectFile();
-        if (objfile) {
-          FileSpec symfile_spec(objfile->GetFileSpec());
-          if (symfile_spec &&
-              llvm::StringRef(symfile_spec.GetPath())
-                  .contains_insensitive(".dSYM/Contents/Resources/DWARF") &&
-              FileSystem::Instance().Exists(symfile_spec)) {
-            while (module_spec.GetFilename()) {
-              std::string module_basename(
-                  module_spec.GetFilename().GetCString());
-              std::string original_module_basename(module_basename);
-
-              bool was_keyword = false;
-
-              // FIXME: for Python, we cannot allow certain characters in
-              // module
-              // filenames we import. Theoretically, different scripting
-              // languages may have different sets of forbidden tokens in
-              // filenames, and that should be dealt with by each
-              // ScriptInterpreter. For now, we just replace dots with
-              // underscores, but if we ever support anything other than
-              // Python we will need to rework this
-              llvm::replace(module_basename, '.', '_');
-              llvm::replace(module_basename, ' ', '_');
-              llvm::replace(module_basename, '-', '_');
-              ScriptInterpreter *script_interpreter =
-                  target->GetDebugger().GetScriptInterpreter();
-              if (script_interpreter &&
-                  script_interpreter->IsReservedWord(module_basename.c_str())) {
-                module_basename.insert(module_basename.begin(), '_');
-                was_keyword = true;
-              }
+  while (module_spec.GetFilename()) {
+    std::string module_basename(module_spec.GetFilename().GetCString());
+    std::string original_module_basename(module_basename);
+
+    bool was_keyword = false;
+
+    // FIXME: for Python, don't allow certain characters in imported module
+    // filenames. Theoretically, different scripting languages may have
+    // different sets of forbidden tokens in filenames, and that should
+    // be dealt with by each ScriptInterpreter. For now, just replace dots
+    // with underscores. In order to support anything other than Python
+    // this will need to be reworked.
+    llvm::replace(module_basename, '.', '_');
+    llvm::replace(module_basename, ' ', '_');
+    llvm::replace(module_basename, '-', '_');
+    ScriptInterpreter *script_interpreter =
+        target.GetDebugger().GetScriptInterpreter();
+    if (script_interpreter &&
+        script_interpreter->IsReservedWord(module_basename.c_str())) {
+      module_basename.insert(module_basename.begin(), '_');
+      was_keyword = true;
+    }
 
-              StreamString path_string;
-              StreamString original_path_string;
-              // for OSX we are going to be in
-              // .dSYM/Contents/Resources/DWARF/<basename> let us go to
-              // .dSYM/Contents/Resources/Python/<basename>.py and see if the
-              // file exists
-              path_string.Printf("%s/../Python/%s.py",
-                                 symfile_spec.GetDirectory().GetCString(),
-                                 module_basename.c_str());
-              original_path_string.Printf(
-                  "%s/../Python/%s.py",
-                  symfile_spec.GetDirectory().GetCString(),
-                  original_module_basename.c_str());
-              FileSpec script_fspec(path_string.GetString());
-              FileSystem::Instance().Resolve(script_fspec);
-              FileSpec orig_script_fspec(original_path_string.GetString());
-              FileSystem::Instance().Resolve(orig_script_fspec);
-
-              // if we did some replacements of reserved characters, and a
-              // file with the untampered name exists, then warn the user
-              // that the file as-is shall not be loaded
-              if (module_basename != original_module_basename &&
-                  FileSystem::Instance().Exists(orig_script_fspec)) {
-                const char *reason_for_complaint =
-                    was_keyword ? "conflicts with a keyword"
-                                : "contains reserved characters";
-                if (FileSystem::Instance().Exists(script_fspec))
-                  feedback_stream.Printf(
-                      "warning: the symbol file '%s' contains a debug "
-                      "script. However, its name"
-                      " '%s' %s and as such cannot be loaded. LLDB will"
-                      " load '%s' instead. Consider removing the file with "
-                      "the malformed name to"
-                      " eliminate this warning.\n",
-                      symfile_spec.GetPath().c_str(),
-                      original_path_string.GetData(), reason_for_complaint,
-                      path_string.GetData());
-                else
-                  feedback_stream.Printf(
-                      "warning: the symbol file '%s' contains a debug "
-                      "script. However, its name"
-                      " %s and as such cannot be loaded. If you intend"
-                      " to have this script loaded, please rename '%s' to "
-                      "'%s' and retry.\n",
-                      symfile_spec.GetPath().c_str(), reason_for_complaint,
-                      original_path_string.GetData(), path_string.GetData());
-              }
+    StreamString path_string;
+    StreamString original_path_string;
+    // On macOS the symbol file lives at
+    // .dSYM/Contents/Resources/DWARF/<basename>; look for a matching script at
+    // .dSYM/Contents/Resources/Python/<basename>.py and see if the
+    // file exists.
+    path_string.Printf("%s/../Python/%s.py",
+                       symfile_spec.GetDirectory().GetCString(),
+                       module_basename.c_str());
+    original_path_string.Printf("%s/../Python/%s.py",
+                                symfile_spec.GetDirectory().GetCString(),
+                                original_module_basename.c_str());
+    FileSpec script_fspec(path_string.GetString());
+    FileSystem::Instance().Resolve(script_fspec);
+    FileSpec orig_script_fspec(original_path_string.GetString());
+    FileSystem::Instance().Resolve(orig_script_fspec);
+
+    // if we did some replacements of reserved characters, and a
+    // file with the untampered name exists, then warn the user
+    // that the file as-is shall not be loaded
+    if (module_basename != original_module_basename &&
+        FileSystem::Instance().Exists(orig_script_fspec)) {
+      const char *reason_for_complaint = was_keyword
+                                             ? "conflicts with a keyword"
+                                             : "contains reserved characters";
+      if (FileSystem::Instance().Exists(script_fspec))
+        feedback_stream.Printf(
+            "warning: the symbol file '%s' contains a debug "
+            "script. However, its name"
+            " '%s' %s and as such cannot be loaded. LLDB will"
+            " load '%s' instead. Consider removing the file with "
+            "the malformed name to"
+            " eliminate this warning.\n",
+            symfile_spec.GetPath().c_str(), original_path_string.GetData(),
+            reason_for_complaint, path_string.GetData());
+      else
+        feedback_stream.Printf(
+            "warning: the symbol file '%s' contains a debug "
+            "script. However, its name"
+            " %s and as such cannot be loaded. If you intend"
+            " to have this script loaded, please rename '%s' to "
+            "'%s' and retry.\n",
+            symfile_spec.GetPath().c_str(), reason_for_complaint,
+            original_path_string.GetData(), path_string.GetData());
+    }
 
-              if (FileSystem::Instance().Exists(script_fspec)) {
-                file_list.Append(script_fspec);
-                break;
-              }
+    if (FileSystem::Instance().Exists(script_fspec)) {
+      file_list.Append(script_fspec);
+      break;
+    }
 
-              // If we didn't find the python file, then keep stripping the
-              // extensions and try again
-              ConstString filename_no_extension(
-                  module_spec.GetFileNameStrippingExtension());
-              if (module_spec.GetFilename() == filename_no_extension)
-                break;
+    // If we didn't find the python file, then keep stripping the
+    // extensions and try again
+    ConstString filename_no_extension(
+        module_spec.GetFileNameStrippingExtension());
+    if (module_spec.GetFilename() == filename_no_extension)
+      break;
 
-              module_spec.SetFilename(filename_no_extension);
-            }
-          }
-        }
-      }
-    }
+    module_spec.SetFilename(filename_no_extension);
   }
+
   return file_list;
 }
 
+FileSpecList PlatformDarwin::LocateExecutableScriptingResources(
+    Target *target, Module &module, Stream &feedback_stream) {
+  if (!target)
+    return {};
+
+  // For now, only Python scripts are supported for auto-loading.
+  if (target->GetDebugger().GetScriptLanguage() != eScriptLanguagePython)
+    return {};
+
+  // NB some extensions might be meaningful and should not be stripped -
+  // "this.binary.file"
+  // should not lose ".file" but GetFileNameStrippingExtension() will do
+  // precisely that. Ideally, we should have a per-platform list of
+  // extensions (".exe", ".app", ".dSYM", ".framework") which should be
+  // stripped while leaving "this.binary.file" as-is.
+
+  const FileSpec &module_spec = module.GetFileSpec();
+
+  if (!module_spec)
+    return {};
+
+  SymbolFile *symfile = module.GetSymbolFile();
+  if (!symfile)
+    return {};
+
+  ObjectFile *objfile = symfile->GetObjectFile();
+  if (!objfile)
+    return {};
+
+  const FileSpec &symfile_spec = objfile->GetFileSpec();
+  if (symfile_spec &&
+      llvm::StringRef(symfile_spec.GetPath())
+          .contains_insensitive(".dSYM/Contents/Resources/DWARF") &&
+      FileSystem::Instance().Exists(symfile_spec))
+    return LocateExecutableScriptingResourcesFromDSYM(
+        feedback_stream, module_spec, *target, symfile_spec);
+
+  return {};
+}
+
 Status PlatformDarwin::ResolveSymbolFile(Target &target,
                                          const ModuleSpec &sym_spec,
                                          FileSpec &sym_file) {
diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
index 9a53252b2b4ae..8c1919eca7dda 100644
--- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
+++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
@@ -215,10 +215,6 @@ Status ProcessWindows::DoLaunch(Module *exe_module,
   if (error.Success())
     SetID(launch_info.GetProcessID());
   m_pty = launch_info.TakePTY();
-  // At this point, Process owns the ConPTY. If ProcessLaunchInfo still has a
-  // reference to it, it might get closed prematurely if another target is
-  // created.
-  assert(m_pty.use_count() == 1 && "More than one reference to the ConPTY");
   return error;
 }
 
diff --git a/lldb/test/API/commands/frame/var/TestFrameVar.py b/lldb/test/API/commands/frame/var/TestFrameVar.py
index d8260a5657618..b70120cb2d8e1 100644
--- a/lldb/test/API/commands/frame/var/TestFrameVar.py
+++ b/lldb/test/API/commands/frame/var/TestFrameVar.py
@@ -16,6 +16,7 @@ class TestFrameVar(TestBase):
     # set this to true.  That way it won't be run once for
     # each debug info format.
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def test_frame_var(self):
         self.build()
diff --git a/lldb/test/API/commands/platform/connect/TestPlatformConnect.py b/lldb/test/API/commands/platform/connect/TestPlatformConnect.py
index 5df0c16fbd1f7..0f9a51e216215 100644
--- a/lldb/test/API/commands/platform/connect/TestPlatformConnect.py
+++ b/lldb/test/API/commands/platform/connect/TestPlatformConnect.py
@@ -8,6 +8,7 @@
 
 class TestPlatformProcessConnect(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     @skipIfRemote
     @expectedFailureAll(hostoslist=["windows"], triple=".*-android")
diff --git a/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py b/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
index 65f0cefae96b4..e9e6845c0f549 100644
--- a/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
+++ b/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
@@ -9,6 +9,7 @@
 
 class TestPlatformProcessLaunchGDBServer(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def _launch_and_connect(self, exe):
         hostname = socket.getaddrinfo("localhost", 0, proto=socket.IPPROTO_TCP)[0][4][0]
diff --git a/lldb/test/API/commands/process/launch/TestProcessLaunch.py b/lldb/test/API/commands/process/launch/TestProcessLaunch.py
index 92d0c468741e5..28de5bc0623d0 100644
--- a/lldb/test/API/commands/process/launch/TestProcessLaunch.py
+++ b/lldb/test/API/commands/process/launch/TestProcessLaunch.py
@@ -13,6 +13,7 @@
 
 class ProcessLaunchTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def setUp(self):
         # Call super's setUp().
diff --git a/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py b/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py
index 8425ab09ab9d7..01f5e652d37bd 100644
--- a/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py
+++ b/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py
@@ -12,6 +12,7 @@
 
 class SettingsUseSourceCacheTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def test_set_use_source_cache_false(self):
         """Test that after 'set use-source-cache false', files are not locked."""
diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py
index c8527abf3c84e..a32b8feecc5cf 100644
--- a/lldb/test/API/commands/statistics/basic/TestStats.py
+++ b/lldb/test/API/commands/statistics/basic/TestStats.py
@@ -11,6 +11,7 @@
 
 class TestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def test_enable_disable(self):
         """
diff --git a/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py b/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py
index 165fae72319ae..47bbd2439434c 100644
--- a/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py
+++ b/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py
@@ -13,6 +13,7 @@
 
 class TestAutoInstallMainExecutable(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     @skipIfRemote
     @skipIfWindows  # This test is flaky on Windows
diff --git a/lldb/test/API/commands/target/dump-separate-debug-info/dwo/TestDumpDwo.py b/lldb/test/API/commands/target/dump-separate-debug-info/dwo/TestDumpDwo.py
index d819b5ed9ca87..7ac4e16e8cdcb 100644
--- a/lldb/test/API/commands/target/dump-separate-debug-info/dwo/TestDumpDwo.py
+++ b/lldb/test/API/commands/target/dump-separate-debug-info/dwo/TestDumpDwo.py
@@ -12,6 +12,7 @@
 
 class TestDumpDWO(lldbtest.TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def get_dwos_from_json_output(self):
         """Returns a dictionary of `symfile` -> {`dwo_name` -> dwo_info object}."""
diff --git a/lldb/test/API/commands/trace/TestTraceStartStop.py b/lldb/test/API/commands/trace/TestTraceStartStop.py
index 9450f8b0961a8..8f882eb5d974b 100644
--- a/lldb/test/API/commands/trace/TestTraceStartStop.py
+++ b/lldb/test/API/commands/trace/TestTraceStartStop.py
@@ -7,6 +7,8 @@
 
 @skipIfNoIntelPT
 class TestTraceStartStop(TraceIntelPTTestCaseBase):
+    SHARED_BUILD_TESTCASE = False
+
     def expectGenericHelpMessageForStartCommand(self):
         self.expect(
             "help thread trace start",
diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
index 605561c757372..e45362295690d 100644
--- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
+++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
@@ -14,6 +14,7 @@
 
 class BreakpointCommandTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528")
     def test_breakpoint_command_sequence(self):
diff --git a/lldb/test/API/functionalities/breakpoint/comp_dir_symlink/TestCompDirSymLink.py b/lldb/test/API/functionalities/breakpoint/comp_dir_symlink/TestCompDirSymLink.py
index ca2c7c3d1ad93..1430eb890c2e6 100644
--- a/lldb/test/API/functionalities/breakpoint/comp_dir_symlink/TestCompDirSymLink.py
+++ b/lldb/test/API/functionalities/breakpoint/comp_dir_symlink/TestCompDirSymLink.py
@@ -16,6 +16,8 @@
 
 
 class CompDirSymLinkTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def setUp(self):
         # Call super's setUp().
         TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py b/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py
index 29cf31563a9a9..7cecd1d290683 100644
--- a/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py
+++ b/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py
@@ -13,6 +13,8 @@
 
 
 class TestObjCBreakpoints(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @add_test_categories(["objc"])
     def test_break(self):
         """Test setting Objective-C specific breakpoints (DWARF in .o files)."""
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
index ca2d2d6b49541..2a420d8ed58ae 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
@@ -10,6 +10,7 @@
 
 class StdMapDataFormatterTestCase(TestBase):
     TEST_WITH_PDB_DEBUG_INFO = True
+    SHARED_BUILD_TESTCASE = False
 
     def setUp(self):
         TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
index 00047e419de37..34989aea9de6e 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
@@ -12,6 +12,7 @@
 
 class StdStringDataFormatterTestCase(TestBase):
     TEST_WITH_PDB_DEBUG_INFO = True
+    SHARED_BUILD_TESTCASE = False
 
     def setUp(self):
         # Call super's setUp().
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestPty.py b/lldb/test/API/functionalities/gdb_remote_client/TestPty.py
index 94eeb6e3ba11a..47f687806dbab 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestPty.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestPty.py
@@ -7,6 +7,8 @@
 
 @skipIf(hostoslist=["windows"])
 class TestPty(GDBRemoteTestBase):
+    SHARED_BUILD_TESTCASE = False
+
     server_socket_class = PtyServerSocket
 
     def get_term_attrs(self):
diff --git a/lldb/test/API/functionalities/inferior-changed/TestInferiorChanged.py b/lldb/test/API/functionalities/inferior-changed/TestInferiorChanged.py
index ea0283e119f1e..c8114ab78d9e0 100644
--- a/lldb/test/API/functionalities/inferior-changed/TestInferiorChanged.py
+++ b/lldb/test/API/functionalities/inferior-changed/TestInferiorChanged.py
@@ -10,6 +10,8 @@
 
 
 class ChangedInferiorTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipIf(hostoslist=["windows"])
     @no_debug_info_test
     def test_inferior_crashing(self):
diff --git a/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py b/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py
index 18371669462e2..668d31d8a5acd 100644
--- a/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py
+++ b/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py
@@ -10,6 +10,8 @@
 
 
 class LimitDebugInfoTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def _check_type(self, target, name):
         exe = target.FindModule(lldb.SBFileSpec("a.out"))
         type_ = exe.FindFirstType(name)
diff --git a/lldb/test/API/functionalities/module_cache/bsd/TestModuleCacheBSD.py b/lldb/test/API/functionalities/module_cache/bsd/TestModuleCacheBSD.py
index 20312f829fdc1..180d487fd4845 100644
--- a/lldb/test/API/functionalities/module_cache/bsd/TestModuleCacheBSD.py
+++ b/lldb/test/API/functionalities/module_cache/bsd/TestModuleCacheBSD.py
@@ -10,6 +10,8 @@
 
 
 class ModuleCacheTestcaseBSD(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def setUp(self):
         # Call super's setUp().
         TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py b/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py
index 501ceb705c579..7d7d31b366f4b 100644
--- a/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py
+++ b/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py
@@ -9,6 +9,8 @@
 
 
 class DebugIndexCacheTestcase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def setUp(self):
         # Call super's setUp().
         TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/rerun_and_expr/TestRerunAndExpr.py b/lldb/test/API/functionalities/rerun_and_expr/TestRerunAndExpr.py
index 1d62af4299c3b..e0d6a811fc22c 100644
--- a/lldb/test/API/functionalities/rerun_and_expr/TestRerunAndExpr.py
+++ b/lldb/test/API/functionalities/rerun_and_expr/TestRerunAndExpr.py
@@ -11,6 +11,8 @@
 
 
 class TestRerunExpr(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     # FIXME: on Windows rebuilding the binary isn't enough to unload it
     #        on progrem restart. One will have to try hard to evict
     #        the module from the ModuleList (possibly including a call to
diff --git a/lldb/test/API/functionalities/rerun_and_expr_dylib/TestRerunAndExprDylib.py b/lldb/test/API/functionalities/rerun_and_expr_dylib/TestRerunAndExprDylib.py
index 19edaac964e62..cc9ffd280a021 100644
--- a/lldb/test/API/functionalities/rerun_and_expr_dylib/TestRerunAndExprDylib.py
+++ b/lldb/test/API/functionalities/rerun_and_expr_dylib/TestRerunAndExprDylib.py
@@ -26,6 +26,8 @@ def isUbuntu18_04():
 
 
 class TestRerunExprDylib(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipTestIfFn(isUbuntu18_04, bugnumber="rdar://103831050")
     @skipIfWindows
     @skipIfRemote
diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py
index 08d78fb996c75..8181f27a0d669 100644
--- a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py
+++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py
@@ -6,6 +6,7 @@
 
 class TestStepUntilAPI(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def setUp(self):
         super().setUp()
diff --git a/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py b/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
index 9540dc066f308..a8cb87063f50a 100644
--- a/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
+++ b/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
@@ -7,6 +7,7 @@
 
 class TestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def build_and_run(self, test_file):
         """
diff --git a/lldb/test/API/lang/c/shared_lib_stripped_symbols/TestSharedLibStrippedSymbols.py b/lldb/test/API/lang/c/shared_lib_stripped_symbols/TestSharedLibStrippedSymbols.py
index 5440778572f8d..173ebc769b922 100644
--- a/lldb/test/API/lang/c/shared_lib_stripped_symbols/TestSharedLibStrippedSymbols.py
+++ b/lldb/test/API/lang/c/shared_lib_stripped_symbols/TestSharedLibStrippedSymbols.py
@@ -9,6 +9,8 @@
 
 
 class SharedLibStrippedTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @expectedFailureAll(oslist=["windows"])
     # Sometimes fails with:
     # error: Couldn't allocate space for materialized struct: Couldn't malloc: address space is full
diff --git a/lldb/test/API/lang/cpp/abi_tag_lookup/TestAbiTagLookup.py b/lldb/test/API/lang/cpp/abi_tag_lookup/TestAbiTagLookup.py
index 4f6e41ed29de1..19f4a4e14ed22 100644
--- a/lldb/test/API/lang/cpp/abi_tag_lookup/TestAbiTagLookup.py
+++ b/lldb/test/API/lang/cpp/abi_tag_lookup/TestAbiTagLookup.py
@@ -10,6 +10,8 @@
 
 
 class AbiTagLookupTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipIfWindows
     @expectedFailureAll(debug_info=["dwarf", "gmodules", "dwo"])
     def test_abi_tag_lookup(self):
diff --git a/lldb/test/API/lang/cpp/abi_tag_structors/TestAbiTagStructors.py b/lldb/test/API/lang/cpp/abi_tag_structors/TestAbiTagStructors.py
index 2d3e4f7cdd472..58b726417ee0a 100644
--- a/lldb/test/API/lang/cpp/abi_tag_structors/TestAbiTagStructors.py
+++ b/lldb/test/API/lang/cpp/abi_tag_structors/TestAbiTagStructors.py
@@ -10,6 +10,8 @@
 
 
 class AbiTagStructorsTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipIf(
         compiler="clang",
         compiler_version=["<", "22"],
diff --git a/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py b/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
index 9de7eb2e4a6e3..3f42dc195d118 100644
--- a/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
+++ b/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
@@ -9,6 +9,8 @@
 
 
 class TestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def test(self):
         self.build()
         lldbutil.run_to_source_breakpoint(
diff --git a/lldb/test/API/lang/cpp/expr-definition-in-dylib/TestExprDefinitionInDylib.py b/lldb/test/API/lang/cpp/expr-definition-in-dylib/TestExprDefinitionInDylib.py
index b3bed43c75873..b0781c8d442e5 100644
--- a/lldb/test/API/lang/cpp/expr-definition-in-dylib/TestExprDefinitionInDylib.py
+++ b/lldb/test/API/lang/cpp/expr-definition-in-dylib/TestExprDefinitionInDylib.py
@@ -5,6 +5,7 @@
 
 
 class ExprDefinitionInDylibTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
 
     @skipIf(
         compiler="clang",
diff --git a/lldb/test/API/lang/cpp/gmodules/template-with-same-arg/TestTemplateWithSameArg.py b/lldb/test/API/lang/cpp/gmodules/template-with-same-arg/TestTemplateWithSameArg.py
index d40be55872eae..00fa739ee0591 100644
--- a/lldb/test/API/lang/cpp/gmodules/template-with-same-arg/TestTemplateWithSameArg.py
+++ b/lldb/test/API/lang/cpp/gmodules/template-with-same-arg/TestTemplateWithSameArg.py
@@ -27,6 +27,8 @@
 
 
 class TestTemplateWithSameArg(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def setUp(self):
         TestBase.setUp(self)
         self.build()
diff --git a/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py b/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py
index 41141164769ec..73d43207cd12c 100644
--- a/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py
+++ b/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py
@@ -11,6 +11,8 @@
 from lldbsuite.test import lldbplatformutil
 
 class NamespaceLookupTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def setUp(self):
         # Call super's setUp().
         TestBase.setUp(self)
diff --git a/lldb/test/API/lang/cpp/template-alias/TestTemplateAlias.py b/lldb/test/API/lang/cpp/template-alias/TestTemplateAlias.py
index 8b6d6dcbc38ba..83a776fc1735d 100644
--- a/lldb/test/API/lang/cpp/template-alias/TestTemplateAlias.py
+++ b/lldb/test/API/lang/cpp/template-alias/TestTemplateAlias.py
@@ -5,6 +5,8 @@
 
 
 class TestTemplateAlias(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def do_test(self, extra_flags):
         self.build(dictionary=extra_flags)
         self.main_source_file = lldb.SBFileSpec("main.cpp")
diff --git a/lldb/test/API/lang/cpp/template-function/TestTemplateFunctions.py b/lldb/test/API/lang/cpp/template-function/TestTemplateFunctions.py
index 3be93dedfd11d..aac9b0a2450cc 100644
--- a/lldb/test/API/lang/cpp/template-function/TestTemplateFunctions.py
+++ b/lldb/test/API/lang/cpp/template-function/TestTemplateFunctions.py
@@ -8,6 +8,8 @@
 
 
 class TemplateFunctionsTestCase(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def do_test_template_function(self, add_cast):
         self.build()
         lldbutil.run_to_source_breakpoint(
diff --git a/lldb/test/API/lang/cpp/unique-types3/TestUniqueTypes3.py b/lldb/test/API/lang/cpp/unique-types3/TestUniqueTypes3.py
index f72701b5eee07..9f41bbeee0636 100644
--- a/lldb/test/API/lang/cpp/unique-types3/TestUniqueTypes3.py
+++ b/lldb/test/API/lang/cpp/unique-types3/TestUniqueTypes3.py
@@ -9,6 +9,8 @@
 
 
 class UniqueTypesTestCase3(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def do_test(self, debug_flags):
         """Test that we display the correct template instantiation."""
         self.build(dictionary=debug_flags)
diff --git a/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py b/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py
index 480d99523e8a2..921db09db1c57 100644
--- a/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py
+++ b/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py
@@ -8,6 +8,8 @@
 
 
 class TestObjCStructArgument(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     def setUp(self):
         # Call super's setUp().
         TestBase.setUp(self)
diff --git a/lldb/test/API/lldbutil-tests/failed-to-hit-breakpoint/TestLLDBUtilFailedToHitBreakpoint.py b/lldb/test/API/lldbutil-tests/failed-to-hit-breakpoint/TestLLDBUtilFailedToHitBreakpoint.py
index 632645ac7c7df..60a94e9c25b34 100644
--- a/lldb/test/API/lldbutil-tests/failed-to-hit-breakpoint/TestLLDBUtilFailedToHitBreakpoint.py
+++ b/lldb/test/API/lldbutil-tests/failed-to-hit-breakpoint/TestLLDBUtilFailedToHitBreakpoint.py
@@ -11,7 +11,6 @@
 class LLDBUtilFailedToHitBreakpointTest(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
-    @expectedFailureAll(oslist=["windows"])
     def test_error_message(self):
         """
         Tests that run_to_source_breakpoint prints the right error message
diff --git a/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py b/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py
index 9309de4824ec4..c941d7a61da05 100644
--- a/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py
+++ b/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py
@@ -12,6 +12,8 @@
 
 
 class TestFirmwareCorefiles(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipIf(
         debug_info=no_match(["dsym"]),
         bugnumber="This test is looking explicitly for a dSYM",
diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
index b17ee83ea04fe..b6f6368f6da80 100644
--- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
+++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
@@ -7,6 +7,7 @@
 
 class TestSimulatorPlatformLaunching(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def check_load_commands(self, expected_load_command):
         """sanity check the built binary for the expected number of load commands"""
diff --git a/lldb/test/API/macosx/skinny-corefile/TestSkinnyCorefile.py b/lldb/test/API/macosx/skinny-corefile/TestSkinnyCorefile.py
index bc19c69df7620..66a3cba83ff45 100644
--- a/lldb/test/API/macosx/skinny-corefile/TestSkinnyCorefile.py
+++ b/lldb/test/API/macosx/skinny-corefile/TestSkinnyCorefile.py
@@ -12,6 +12,8 @@
 
 
 class TestSkinnyCorefile(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipIf(
         debug_info=no_match(["dsym"]),
         bugnumber="This test is looking explicitly for a dSYM",
diff --git a/lldb/test/API/python_api/debugger/TestDebuggerAPI.py b/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
index 600cde3c6b807..93ee52dc88ef8 100644
--- a/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
+++ b/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
@@ -11,6 +11,7 @@
 
 class DebuggerAPITestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def test_debugger_api_boundary_condition(self):
         """Exercise SBDebugger APIs with boundary conditions."""
diff --git a/lldb/test/API/python_api/target-arch-from-module/TestTargetArchFromModule.py b/lldb/test/API/python_api/target-arch-from-module/TestTargetArchFromModule.py
index 0141828ae1eab..ba9ab286f82e6 100644
--- a/lldb/test/API/python_api/target-arch-from-module/TestTargetArchFromModule.py
+++ b/lldb/test/API/python_api/target-arch-from-module/TestTargetArchFromModule.py
@@ -12,6 +12,8 @@
 
 
 class TargetArchFromModule(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipIf(
         debug_info=no_match(["dsym"]),
         bugnumber="This test is looking explicitly for a dSYM",
diff --git a/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py b/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py
index 93b23d0ba81cb..a559307e59930 100644
--- a/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py
+++ b/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py
@@ -13,6 +13,8 @@
 
 
 class ModuleUnifiedSectionList(TestBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipUnlessPlatform(["linux", "freebsd", "netbsd"])
     def test_unified_section_list(self):
         self.build()
diff --git a/lldb/test/API/riscv/disassembler/TestDisassembler.py b/lldb/test/API/riscv/disassembler/TestDisassembler.py
new file mode 100644
index 0000000000000..2f01283786b1f
--- /dev/null
+++ b/lldb/test/API/riscv/disassembler/TestDisassembler.py
@@ -0,0 +1,79 @@
+"""
+Tests that LLDB can correctly set up a disassembler using extensions from the .riscv.attributes section.
+"""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+import os
+
+
+class TestDisassembler(TestBase):
+    expected_zbb_instrs = ["andn", "orn", "xnor", "rol", "ror"]
+
+    def test_without_riscv_attributes(self):
+        """
+        Tests disassembly of a riscv binary without the .riscv.attributes.
+        Without the .riscv.attributes section lldb won't set up a disassembler to
+        handle the bitmanip extension, so it is not expected to see zbb instructions
+        in the output.
+        """
+        yaml = os.path.join(self.getSourceDir(), "stripped.out.yaml")
+        exe = self.getBuildArtifact("stripped.out")
+        self.yaml2obj(yaml, exe)
+
+        target = self.dbg.CreateTarget(exe)
+
+        self.expect("disassemble --name do_zbb_stuff")
+        output = self.res.GetOutput()
+
+        for instr in self.expected_zbb_instrs:
+            self.assertNotIn(
+                instr, output, "Zbb instructions should not be disassembled"
+            )
+
+        self.assertEqual(
+            output.count("unknown"),
+            len(self.expected_zbb_instrs),
+            "Instructions from the Zbb extension should be displayed as <unknown>",
+        )
+
+    def test_with_riscv_attributes(self):
+        """
+        Tests disassembly of a riscv binary with the .riscv.attributes.
+        """
+        yaml = os.path.join(self.getSourceDir(), "a.out.yaml")
+        exe = self.getBuildArtifact("a.out")
+        self.yaml2obj(yaml, exe)
+
+        target = self.dbg.CreateTarget(exe)
+
+        self.expect("disassemble --name do_zbb_stuff")
+        output = self.res.GetOutput()
+
+        for instr in self.expected_zbb_instrs:
+            self.assertIn(instr, output, "Invalid disassembler output")
+
+    def test_conflicting_extensions(self):
+        """
+        This test demonstrates the scenario where:
+        1. file_with_zcd.c is compiled with rv64gc (includes C and D).
+        2. file_with_zcmp.c is compiled with rv64imad_zcmp (includes Zcmp).
+        3. The linker merges .riscv.attributes, creating the union: C + D + Zcmp.
+
+        The Zcmp extension is incompatible with the C extension when the D extension is enabled.
+        Therefore, the arch string contains conflicting extensions, and LLDB should
+        display an appropriate warning in this case.
+        """
+        yaml = os.path.join(self.getSourceDir(), "conflicting.out.yaml")
+        exe = self.getBuildArtifact("conflicting.out")
+        self.yaml2obj(yaml, exe)
+
+        target = self.dbg.CreateTarget(exe)
+        # NOTE(review): no command precedes this read; confirm the warning is captured.
+        output = self.res.GetOutput()
+        self.assertIn(
+            "The .riscv.attributes section contains an invalid RISC-V arch string",
+            output,
+        )
diff --git a/lldb/test/API/riscv/disassembler/a.out.yaml b/lldb/test/API/riscv/disassembler/a.out.yaml
new file mode 100644
index 0000000000000..5823ded9606c8
--- /dev/null
+++ b/lldb/test/API/riscv/disassembler/a.out.yaml
@@ -0,0 +1,32 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_RISCV
+  Flags:           [ EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_DOUBLE ]
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x550
+    AddressAlign:    0x4
+    Content:         EF002002AA87172500000335A5A782653000137101FF814601470A88EFF05FFD029097210000938161298280010017250000130525A9972700009387A7A86388A7009727000083B7E7A491C38287828017250000130505A797250000938585A6898D93D73540FD91BE95858599C59727000083B7A7A291C3828782809727000083C747A49DE7411106E49727000083B7E79F91C717250000033545A28297EFF01FF9A260854717270000230DF7A041018280828071BF411106E422E000083376B5403366B5403346B5403316B5603356B560A260026441018280011106EC22E8001001452330A4FE2326A4FEEFF0BFFC033504FEE260426405618280
+  - Name:            .riscv.attributes
+    Type:            SHT_RISCV_ATTRIBUTES
+    AddressAlign:    0x1
+    Content:         416C000000726973637600016200000004100572763634693270315F6D3270305F613270315F663270325F643270325F633270305F7A696373723270305F7A6966656E6365693270305F7A6D6D756C3170305F7A61616D6F3170305F7A616C7273633170305F7A626231703000
+Symbols:
+  - Name:            _Z12do_zbb_stuffv
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x606
+    Size:            0x24
+  - Name:            main
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x62A
+    Size:            0x22
+...
diff --git a/lldb/test/API/riscv/disassembler/conflicting.out.yaml b/lldb/test/API/riscv/disassembler/conflicting.out.yaml
new file mode 100644
index 0000000000000..2e0a155f3d0ce
--- /dev/null
+++ b/lldb/test/API/riscv/disassembler/conflicting.out.yaml
@@ -0,0 +1,38 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_RISCV
+  Flags:           [ EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_DOUBLE ]
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x550
+    AddressAlign:    0x4
+    Content:         EF002002AA87172500000335A5A782653000137101FF814601470A88EFF05FFD029097210000938161298280010017250000130525A9972700009387A7A86388A7009727000083B7E7A491C38287828017250000130505A797250000938585A6898D93D73540FD91BE95858599C59727000083B7A7A291C3828782809727000083C747A49DE7411106E49727000083B7E79F91C717250000033545A28297EFF01FF9A260854717270000230DF7A041018280828071BF011106EC22E8001001452330A4FE2326A4FEEF004001EF008002033504FEE260426405618280411106E422E0000802A006A42AA82EACA260026441018280411106E422E0000872B866AC26AC72BEA260026441018280
+  - Name:            .riscv.attributes
+    Type:            SHT_RISCV_ATTRIBUTES
+    AddressAlign:    0x1
+    Content:         4174000000726973637600016A00000004100572763634693270315F6D3270305F613270315F663270325F643270325F633270305F7A696373723270305F7A6966656E6365693270305F7A6D6D756C3170305F7A61616D6F3170305F7A616C7273633170305F7A63613170305F7A636D7031703000
+Symbols:
+  - Name:            main
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x606
+    Size:            0x26
+  - Name:            function_with_zcd_instructions
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x62C
+    Size:            0x18
+  - Name:            function_with_zcmp_extension
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x644
+    Size:            0x18
+...
diff --git a/lldb/test/API/riscv/disassembler/stripped.out.yaml b/lldb/test/API/riscv/disassembler/stripped.out.yaml
new file mode 100644
index 0000000000000..7c94fa577abc7
--- /dev/null
+++ b/lldb/test/API/riscv/disassembler/stripped.out.yaml
@@ -0,0 +1,28 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_RISCV
+  Flags:           [ EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_DOUBLE ]
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x550
+    AddressAlign:    0x4
+    Content:         EF002002AA87172500000335A5A782653000137101FF814601470A88EFF05FFD029097210000938161298280010017250000130525A9972700009387A7A86388A7009727000083B7E7A491C38287828017250000130505A797250000938585A6898D93D73540FD91BE95858599C59727000083B7A7A291C3828782809727000083C747A49DE7411106E49727000083B7E79F91C717250000033545A28297EFF01FF9A260854717270000230DF7A041018280828071BF411106E422E000083376B5403366B5403346B5403316B5603356B560A260026441018280011106EC22E8001001452330A4FE2326A4FEEFF0BFFC033504FEE260426405618280
+Symbols:
+  - Name:            _Z12do_zbb_stuffv
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x606
+    Size:            0x24
+  - Name:            main
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x62A
+    Size:            0x22
+...
diff --git a/lldb/test/API/source-manager/TestSourceManager.py b/lldb/test/API/source-manager/TestSourceManager.py
index 3500dded815b9..9055dcba93fd1 100644
--- a/lldb/test/API/source-manager/TestSourceManager.py
+++ b/lldb/test/API/source-manager/TestSourceManager.py
@@ -30,6 +30,7 @@ def ansi_color_surround_regex(inner_regex_text):
 
 class SourceManagerTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
+    SHARED_BUILD_TESTCASE = False
 
     def setUp(self):
         # Call super's setUp().
diff --git a/lldb/test/API/test_utils/base/TestBaseTest.py b/lldb/test/API/test_utils/base/TestBaseTest.py
index 41ba481b9b74f..afff48d0c6d13 100644
--- a/lldb/test/API/test_utils/base/TestBaseTest.py
+++ b/lldb/test/API/test_utils/base/TestBaseTest.py
@@ -9,6 +9,8 @@
 
 
 class TestBuildMethod(Base):
+    SHARED_BUILD_TESTCASE = False
+
     def setUp(self):
         super().setUp()
         self._traces = []
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
index 3309800c1dd10..7f9d4325ce4f8 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
@@ -12,6 +12,8 @@
 
 
 class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
+    SHARED_BUILD_TESTCASE = False
+
     def setUp(self):
         lldbdap_testcase.DAPTestCaseBase.setUp(self)
 
diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
index 19f88d88c2ff4..3a4bc62fc6872 100644
--- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
+++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
@@ -12,6 +12,8 @@
 
 
 class TestDAP_disconnect(lldbdap_testcase.DAPTestCaseBase):
+    SHARED_BUILD_TESTCASE = False
+
     source = "main.cpp"
 
     def disconnect_and_assert_no_output_printed(self):
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_args.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_args.py
index 221147f535958..6774f0516ae79 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_args.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_args.py
@@ -11,9 +11,6 @@ class TestDAP_launch_args(lldbdap_testcase.DAPTestCaseBase):
     Tests launch of a simple program with arguments
     """
 
-    @expectedFailureWindows(
-        bugnumber="https://github.com/llvm/llvm-project/issues/137599"
-    )
     def test(self):
         program = self.getBuildArtifact("a.out")
         args = ["one", "with space", "'with single quotes'", '"with double quotes"']
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_basic.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_basic.py
index d0e8a792e4e25..93ae5d05e9d6c 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_basic.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_basic.py
@@ -12,9 +12,6 @@ class TestDAP_launch_basic(lldbdap_testcase.DAPTestCaseBase):
     environment, or anything else is specified.
     """
 
-    @expectedFailureWindows(
-        bugnumber="https://github.com/llvm/llvm-project/issues/137599"
-    )
     def test(self):
         program = self.getBuildArtifact("a.out")
         self.build_and_launch(program)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_debuggerRoot.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_debuggerRoot.py
index 39397120bf5d8..deeab23d3ec56 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_debuggerRoot.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_debuggerRoot.py
@@ -14,9 +14,6 @@ class TestDAP_launch_debuggerRoot(lldbdap_testcase.DAPTestCaseBase):
     the lldb-dap debug adapter.
     """
 
-    @expectedFailureWindows(
-        bugnumber="https://github.com/llvm/llvm-project/issues/137599"
-    )
     def test(self):
         program = self.getBuildArtifact("a.out")
         program_parent_dir = os.path.realpath(os.path.dirname(os.path.dirname(program)))
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_environment_with_object.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_environment_with_object.py
index f84aff742eed2..8c7994eac7926 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_environment_with_object.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_environment_with_object.py
@@ -7,9 +7,6 @@
 
 
 class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
-    @expectedFailureWindows(
-        bugnumber="https://github.com/llvm/llvm-project/issues/137599"
-    )
     def test_environment_with_object(self):
         """
         Tests launch of a simple program with environment variables
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_disabled.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_disabled.py
index b71bfc8152b69..d7b8579845956 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_disabled.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_disabled.py
@@ -13,9 +13,6 @@ class TestDAP_launch_shellExpandArguments_disabled(lldbdap_testcase.DAPTestCaseB
     disabled.
     """
 
-    @expectedFailureWindows(
-        bugnumber="https://github.com/llvm/llvm-project/issues/137599"
-    )
     def test(self):
         program = self.getBuildArtifact("a.out")
         program_dir = os.path.dirname(program)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_enabled.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_enabled.py
index 443bb6f6fee54..7ddde219fc88d 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_enabled.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_enabled.py
@@ -18,10 +18,9 @@ class TestDAP_launch_shellExpandArguments_enabled(lldbdap_testcase.DAPTestCaseBa
     """
 
     @skipIfLinux  # shell argument expansion doesn't seem to work on Linux
-    @expectedFailureWindows(
-        bugnumber="https://github.com/llvm/llvm-project/issues/137599"
+    @expectedFailureAll(
+        oslist=["freebsd", "netbsd", "windows"], bugnumber="llvm.org/pr48349"
     )
-    @expectedFailureAll(oslist=["freebsd", "netbsd"], bugnumber="llvm.org/pr48349")
     def test(self):
         program = self.getBuildArtifact("a.out")
         program_dir = os.path.dirname(program)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_stdio_redirection_and_console.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_stdio_redirection_and_console.py
index 0ed8a5e11bf8b..bec76fb4ef5e1 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_stdio_redirection_and_console.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_stdio_redirection_and_console.py
@@ -4,10 +4,10 @@
 
 from lldbsuite.test.decorators import (
     skipIfAsan,
-    expectedFailureWindows,
     skipIf,
     skipIfBuildType,
     no_match,
+    skipIfWindows,
 )
 import lldbdap_testcase
 import tempfile
@@ -19,9 +19,7 @@ class TestDAP_launch_stdio_redirection_and_console(lldbdap_testcase.DAPTestCaseB
     """
 
     @skipIfAsan
-    @expectedFailureWindows(
-        bugnumber="https://github.com/llvm/llvm-project/issues/137599"
-    )
+    @skipIfWindows  # https://github.com/llvm/llvm-project/issues/62336
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     @skipIfBuildType(["debug"])
     def test(self):
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_version.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_version.py
index a598db41595a5..fca153044da82 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_version.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_version.py
@@ -13,9 +13,6 @@ class TestDAP_launch_version(lldbdap_testcase.DAPTestCaseBase):
     as the one returned by "version" command.
     """
 
-    @expectedFailureWindows(
-        bugnumber="https://github.com/llvm/llvm-project/issues/137599"
-    )
     def test(self):
         program = self.getBuildArtifact("a.out")
         self.build_and_launch(program)
diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
index dfb4906ae6a49..0fdc719b6cb76 100644
--- a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
@@ -12,6 +12,8 @@
 
 @skipIfBuildType(["debug"])
 class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
+    SHARED_BUILD_TESTCASE = False
+
     def read_pid_message(self, fifo_file):
         with open(fifo_file, "r") as file:
             self.assertIn("pid", file.readline())
diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
index 10c67a94407e6..a70fefd358b4b 100644
--- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
+++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
@@ -17,6 +17,8 @@ def make_buffer_verify_dict(start_idx, count, offset=0):
 
 
 class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
+    SHARED_BUILD_TESTCASE = False
+
     def verify_values(self, verify_dict, actual, varref_dict=None, expression=None):
         if "equals" in verify_dict:
             verify = verify_dict["equals"]
@@ -306,6 +308,16 @@ def do_test_scopes_variables_setVariable_evaluate(
             argv, 0x1234, "verify argv was set to 0x1234 (0x1234 != %#x)" % (argv)
         )
 
+        # Test hexadecimal format
+        response = self.set_local("argc", 42, is_hex=True)
+        verify_response = {
+            "type": "int",
+            "value": "0x0000002a",
+        }
+        for key, value in verify_response.items():
+            self.assertEqual(value, response["body"][key])
+        self.set_local("argc", 123)
+
         # Set a variable value whose name is synthetic, like a variable index
         # and verify the value by reading it
         variable_value = 100
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py b/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py
index 76b0b204123dd..4bfd816ae2cdb 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py
@@ -43,6 +43,8 @@ def uint32_trunc(x):
 
 
 class TestGdbRemotePlatformFile(GdbRemoteTestCaseBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipIfWindows
     @add_test_categories(["llgs"])
     def test_platform_file_rdonly(self):
diff --git a/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py b/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py
index ed600d396fad4..12946d9d42d11 100644
--- a/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py
+++ b/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py
@@ -9,6 +9,8 @@
 
 
 class TestGdbRemoteConnection(gdbremote_testcase.GdbRemoteTestCaseBase):
+    SHARED_BUILD_TESTCASE = False
+
     @skipIfRemote  # reverse connect is not a supported use case for now
     def test_reverse_connect(self):
         # Reverse connect is the default connection method.
diff --git a/lldb/test/API/types/AbstractBase.py b/lldb/test/API/types/AbstractBase.py
index fb1e25254b281..0420ffd8f3bbb 100644
--- a/lldb/test/API/types/AbstractBase.py
+++ b/lldb/test/API/types/AbstractBase.py
@@ -22,6 +22,8 @@ class GenericTester(TestBase):
     # printf() stmts (see basic_type.cpp).
     pattern = re.compile(r" (\*?a[^=]*) = '([^=]*)'$")
 
+    SHARED_BUILD_TESTCASE = False
+
     # Assert message.
     DATA_TYPE_GROKKED = "Data type from expr parser output is parsed correctly"
 
diff --git a/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp
index fb02d0ada651e..725d5de094c95 100644
--- a/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp
@@ -54,8 +54,9 @@ SetVariableRequestHandler::Run(const SetVariableArguments &args) const {
   if (!success)
     return llvm::make_error<DAPError>(error.GetCString());
 
+  const bool hex = args.format ? args.format->hex : false;
   VariableDescription desc(variable,
-                           dap.configuration.enableAutoVariableSummaries);
+                           dap.configuration.enableAutoVariableSummaries, hex);
 
   SetVariableResponseBody body;
   body.value = desc.display_value;
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index 9bf04757294d6..a8280bcdd9ee6 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -444,7 +444,7 @@ struct SetVariableArguments {
   std::string value;
 
   /// Specifies details on how to format the response value.
-  ValueFormat format;
+  std::optional<ValueFormat> format;
 };
 bool fromJSON(const llvm::json::Value &, SetVariableArguments &,
               llvm::json::Path);
diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp
index 18ba5cbf58cfd..5d6086b46add0 100644
--- a/lldb/unittests/DAP/ProtocolRequestsTest.cpp
+++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp
@@ -11,6 +11,7 @@
 #include "TestingSupport/TestUtilities.h"
 #include "llvm/Testing/Support/Error.h"
 #include <gtest/gtest.h>
+#include <optional>
 
 using namespace llvm;
 using namespace lldb_dap::protocol;
@@ -413,3 +414,22 @@ TEST(ProtocolRequestsTest, StackTraceResponseBody) {
   ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
   EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body));
 }
+
+TEST(ProtocolRequestsTest, SetVariableArguments) {
+  llvm::Expected<SetVariableArguments> expected =
+      parse<SetVariableArguments>(R"({
+    "variablesReference": 42,
+    "name": "test",
+    "value": "12345"
+  })");
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+  EXPECT_EQ(expected->variablesReference.AsUInt32(), 42U);
+  EXPECT_EQ(expected->name, "test");
+  EXPECT_EQ(expected->value, "12345");
+  EXPECT_EQ(expected->format, std::nullopt);
+
+  // Check required keys.
+  EXPECT_THAT_EXPECTED(
+      parse<SetVariableArguments>(R"({})"),
+      FailedWithMessage("missing value at (root).variablesReference"));
+}
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index d938214f9d0df..81aaf6034cca7 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1004,6 +1004,13 @@ macro(generate_llvm_objects name)
 
   if (ARG_GENERATE_DRIVER)
     string(REPLACE "-" "_" TOOL_NAME ${name})
+
+    set(INITLLVM_ARGS "")
+
+    if(${name} STREQUAL "clang")
+      set(INITLLVM_ARGS ", /*InstallPipeSignalExitHandler=*/true, /*NeedsPOSIXUtilitySignalHandling=*/true")
+    endif()
+
     foreach(path ${CMAKE_MODULE_PATH})
       if(EXISTS ${path}/llvm-driver-template.cpp.in)
         configure_file(
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 8b7c8cd4028bd..b5372eea259f9 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -733,7 +733,6 @@ if (MSVC)
       -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
       -wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data'
       -wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception'
-      -wd4351 # Suppress 'new behavior: elements of array 'array' will be default initialized'
       -wd4456 # Suppress 'declaration of 'var' hides local variable'
       -wd4457 # Suppress 'declaration of 'var' hides function parameter'
       -wd4458 # Suppress 'declaration of 'var' hides class member'
diff --git a/llvm/cmake/modules/llvm-driver-template.cpp.in b/llvm/cmake/modules/llvm-driver-template.cpp.in
index 1470ef1f06164..d4c385c8cf412 100644
--- a/llvm/cmake/modules/llvm-driver-template.cpp.in
+++ b/llvm/cmake/modules/llvm-driver-template.cpp.in
@@ -13,6 +13,6 @@
 int @TOOL_NAME@_main(int argc, char **, const llvm::ToolContext &);
 
 int main(int argc, char **argv) {
-  llvm::InitLLVM X(argc, argv);
+  llvm::InitLLVM X(argc, argv@INITLLVM_ARGS@);
   return @TOOL_NAME@_main(argc, argv, {argv[0], nullptr, false});
 }
diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst
index f7df57d05baa0..a2ee28151b354 100644
--- a/llvm/docs/MIRLangRef.rst
+++ b/llvm/docs/MIRLangRef.rst
@@ -523,7 +523,7 @@ The full syntax of a register operand is shown below:
 
 .. code-block:: text
 
-    [<flags>] <register> [ :<subregister-idx-name> ] [ (tied-def <tied-op>) ]
+    [<flags>] <register> [ .<subregister-idx-name> ] [ :<register-class> ] [ (tied-def <tied-op>) ] [ (<type>) ]
 
 This example shows an instance of the X86 ``XOR32rr`` instruction that has
 5 register operands with different register flags:
@@ -532,6 +532,9 @@ This example shows an instance of the X86 ``XOR32rr`` instruction that has
 
   dead $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags, implicit-def $al
 
+Note that subregister-index, register-class and type cannot be specified for
+physical registers. Additionally, tied-def can only be specified for a use.
+
 .. _register-flags:
 
 Register Flags
@@ -602,7 +605,7 @@ lower bits from the 32-bit virtual register 0 to the 8-bit virtual register 1:
 
 .. code-block:: text
 
-    %1 = COPY %0:sub_8bit
+    %1 = COPY %0.sub_8bit
 
 The names of the subregister indices are target specific, and are typically
 defined in the target's ``*RegisterInfo.td`` file.
diff --git a/llvm/include/llvm/Support/CrashRecoveryContext.h b/llvm/include/llvm/Support/CrashRecoveryContext.h
index ffee81dd24587..4abd2d0eedcde 100644
--- a/llvm/include/llvm/Support/CrashRecoveryContext.h
+++ b/llvm/include/llvm/Support/CrashRecoveryContext.h
@@ -60,7 +60,7 @@ class CrashRecoveryContext {
   LLVM_ABI void unregisterCleanup(CrashRecoveryContextCleanup *cleanup);
 
   /// Enable crash recovery.
-  LLVM_ABI static void Enable();
+  LLVM_ABI static void Enable(bool NeedsPOSIXUtilitySignalHandling = false);
 
   /// Disable crash recovery.
   LLVM_ABI static void Disable();
diff --git a/llvm/include/llvm/Support/InitLLVM.h b/llvm/include/llvm/Support/InitLLVM.h
index 748f5d8aa6aea..4d513bfd576bb 100644
--- a/llvm/include/llvm/Support/InitLLVM.h
+++ b/llvm/include/llvm/Support/InitLLVM.h
@@ -36,10 +36,13 @@ namespace llvm {
 class InitLLVM {
 public:
   LLVM_ABI InitLLVM(int &Argc, const char **&Argv,
-                    bool InstallPipeSignalExitHandler = true);
-  InitLLVM(int &Argc, char **&Argv, bool InstallPipeSignalExitHandler = true)
+                    bool InstallPipeSignalExitHandler = true,
+                    bool NeedsPOSIXUtilitySignalHandling = false);
+  InitLLVM(int &Argc, char **&Argv, bool InstallPipeSignalExitHandler = true,
+           bool NeedsPOSIXUtilitySignalHandling = false)
       : InitLLVM(Argc, const_cast<const char **&>(Argv),
-                 InstallPipeSignalExitHandler) {}
+                 InstallPipeSignalExitHandler,
+                 NeedsPOSIXUtilitySignalHandling) {}
 
   LLVM_ABI ~InitLLVM();
 
diff --git a/llvm/include/llvm/Support/Signals.h b/llvm/include/llvm/Support/Signals.h
index 21b425fffef53..d9bff20b85393 100644
--- a/llvm/include/llvm/Support/Signals.h
+++ b/llvm/include/llvm/Support/Signals.h
@@ -99,8 +99,12 @@ using SignalHandlerCallback = void (*)(void *);
 
 /// Add a function to be called when an abort/kill signal is delivered to the
 /// process. The handler can have a cookie passed to it to identify what
-/// instance of the handler it is.
-LLVM_ABI void AddSignalHandler(SignalHandlerCallback FnPtr, void *Cookie);
+/// instance of the handler it is. The NeedsPOSIXUtilitySignalHandling
+/// argument indicates whether POSIX signal handling semantics are followed,
+/// so that the signal handler resignals itself to terminate after handling
+/// the signal.
+LLVM_ABI void AddSignalHandler(SignalHandlerCallback FnPtr, void *Cookie,
+                               bool NeedsPOSIXUtilitySignalHandling = false);
 
 /// This function registers a function to be called when the user "interrupts"
 /// the program (typically by pressing ctrl-c).  When the user interrupts the
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index b0ac14ba8b393..738d0c063a5e4 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -329,8 +329,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
 
   // Look through ptr->int and ptr->ptr casts.
   if (CE->getOpcode() == Instruction::PtrToInt ||
-      CE->getOpcode() == Instruction::PtrToAddr ||
-      CE->getOpcode() == Instruction::BitCast)
+      CE->getOpcode() == Instruction::PtrToAddr)
     return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL,
                                       DSOEquiv);
 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 32b936aa45eae..2c8612a9d7822 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -984,6 +984,11 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   /// the CallStackRadixTreeBuilder class in ProfileData/MemProf.h for format.
   std::vector<uint64_t> RadixArray;
 
+  /// Map from the module's stack id index to the index in the
+  /// ModuleSummaryIndex's StackIds vector. Populated when the STACK_IDS record
+  /// is processed and used to avoid repeated hash lookups.
+  std::vector<unsigned> StackIdToIndex;
+
 public:
   ModuleSummaryIndexBitcodeReader(
       BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex,
@@ -7636,8 +7641,7 @@ SmallVector<unsigned> ModuleSummaryIndexBitcodeReader::parseAllocInfoContext(
     StackIdList.reserve(NumStackEntries);
     for (unsigned J = 0; J < NumStackEntries; J++) {
       assert(Record[I] < StackIds.size());
-      StackIdList.push_back(
-          TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
+      StackIdList.push_back(StackIdToIndex[Record[I++]]);
     }
   } else {
     unsigned RadixIndex = Record[I++];
@@ -7660,7 +7664,7 @@ SmallVector<unsigned> ModuleSummaryIndexBitcodeReader::parseAllocInfoContext(
         assert(static_cast<std::make_signed_t<unsigned>>(Elem) >= 0);
       }
       RadixIndex++;
-      StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[Elem]));
+      StackIdList.push_back(StackIdToIndex[Elem]);
     }
   }
   return StackIdList;
@@ -8123,16 +8127,22 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
     case bitc::FS_STACK_IDS: { // [n x stackid]
       // Save stack ids in the reader to consult when adding stack ids from the
       // lists in the stack node and alloc node entries.
+      assert(StackIds.empty());
       if (Version <= 11) {
         StackIds = ArrayRef<uint64_t>(Record);
-        break;
+      } else {
+        // This is an array of 32-bit fixed-width values, holding each 64-bit
+        // context id as a pair of adjacent (most significant first) 32-bit
+        // words.
+        assert(Record.size() % 2 == 0);
+        StackIds.reserve(Record.size() / 2);
+        for (auto R = Record.begin(); R != Record.end(); R += 2)
+          StackIds.push_back(*R << 32 | *(R + 1));
       }
-      // This is an array of 32-bit fixed-width values, holding each 64-bit
-      // context id as a pair of adjacent (most significant first) 32-bit words.
-      assert(Record.size() % 2 == 0);
-      StackIds.reserve(Record.size() / 2);
-      for (auto R = Record.begin(); R != Record.end(); R += 2)
-        StackIds.push_back(*R << 32 | *(R + 1));
+      assert(StackIdToIndex.empty());
+      StackIdToIndex.reserve(StackIds.size());
+      for (uint64_t StackId : StackIds)
+        StackIdToIndex.push_back(TheIndex.addOrGetStackIdIndex(StackId));
       break;
     }
 
@@ -8146,7 +8156,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       SmallVector<unsigned> StackIdList;
       for (uint64_t R : drop_begin(Record)) {
         assert(R < StackIds.size());
-        StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[R]));
+        StackIdList.push_back(StackIdToIndex[R]);
       }
       ValueInfo VI = std::get<0>(getValueInfoFromValueId(ValueID));
       PendingCallsites.push_back(CallsiteInfo({VI, std::move(StackIdList)}));
@@ -8162,8 +8172,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       SmallVector<unsigned> StackIdList;
       for (unsigned J = 0; J < NumStackIds; J++) {
         assert(*RecordIter < StackIds.size());
-        StackIdList.push_back(
-            TheIndex.addOrGetStackIdIndex(StackIds[*RecordIter++]));
+        StackIdList.push_back(StackIdToIndex[*RecordIter++]);
       }
       SmallVector<unsigned> Versions;
       for (unsigned J = 0; J < NumVersions; J++)
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index c90ee21c6750b..f54a0c44d717b 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1724,16 +1724,14 @@ bool MIParser::parseSubRegisterIndex(unsigned &SubReg) {
 }
 
 bool MIParser::parseRegisterTiedDefIndex(unsigned &TiedDefIdx) {
-  if (!consumeIfPresent(MIToken::kw_tied_def))
-    return true;
+  assert(Token.is(MIToken::kw_tied_def));
+  lex();
   if (Token.isNot(MIToken::IntegerLiteral))
     return error("expected an integer literal after 'tied-def'");
   if (getUnsigned(TiedDefIdx))
     return true;
   lex();
-  if (expectAndConsume(MIToken::rparen))
-    return true;
-  return false;
+  return expectAndConsume(MIToken::rparen);
 }
 
 bool MIParser::assignRegisterTies(MachineInstr &MI,
@@ -1781,6 +1779,8 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
     if (parseRegisterFlag(Flags))
       return true;
   }
+  // Update IsDef as we may have read a def flag.
+  IsDef = hasRegState(Flags, RegState::Define);
   if (!Token.isRegister())
     return error("expected a register after register flags");
   Register Reg;
@@ -1802,56 +1802,46 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
     if (parseRegisterClassOrBank(*RegInfo))
         return true;
   }
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  if (!hasRegState(Flags, RegState::Define)) {
-    if (consumeIfPresent(MIToken::lparen)) {
-      unsigned Idx;
-      if (!parseRegisterTiedDefIndex(Idx))
-        TiedDefIdx = Idx;
-      else {
-        // Try a redundant low-level type.
-        LLT Ty;
-        if (parseLowLevelType(Token.location(), Ty))
-          return error("expected tied-def or low-level type after '('");
-
-        if (expectAndConsume(MIToken::rparen))
-          return true;
-
-        if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
-          return error("inconsistent type for generic virtual register");
 
-        MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
-        MRI.setType(Reg, Ty);
-        MRI.noteNewVirtualRegister(Reg);
-      }
-    }
-  } else if (consumeIfPresent(MIToken::lparen)) {
-    // Virtual registers may have a tpe with GlobalISel.
-    if (!Reg.isVirtual())
-      return error("unexpected type on physical register");
+  if (consumeIfPresent(MIToken::lparen)) {
+    // For a def, we only expect a type. For a use we expect either a type or a
+    // tied-def. Additionally, for physical registers, we don't expect a type.
+    if (Token.is(MIToken::kw_tied_def)) {
+      if (IsDef)
+        return error("tied-def not supported for defs");
+      unsigned Idx;
+      if (parseRegisterTiedDefIndex(Idx))
+        return true;
+      TiedDefIdx = Idx;
+    } else {
+      if (!Reg.isVirtual())
+        return error("unexpected type on physical register");
 
-    LLT Ty;
-    if (parseLowLevelType(Token.location(), Ty))
-      return true;
+      LLT Ty;
+      // If type parsing fails, forward the parse error for defs.
+      if (parseLowLevelType(Token.location(), Ty))
+        return IsDef ? true
+                     : error("expected tied-def or low-level type after '('");
 
-    if (expectAndConsume(MIToken::rparen))
-      return true;
+      if (expectAndConsume(MIToken::rparen))
+        return true;
 
-    if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
-      return error("inconsistent type for generic virtual register");
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
+        return error("inconsistent type for generic virtual register");
 
-    MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
-    MRI.setType(Reg, Ty);
-  } else if (Reg.isVirtual()) {
-    // Generic virtual registers must have a type.
-    // If we end up here this means the type hasn't been specified and
-    // this is bad!
+      MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
+      MRI.setType(Reg, Ty);
+      MRI.noteNewVirtualRegister(Reg);
+    }
+  } else if (IsDef && Reg.isVirtual()) {
+    // Generic virtual register defs must have a type.
     if (RegInfo->Kind == VRegInfo::GENERIC ||
         RegInfo->Kind == VRegInfo::REGBANK)
       return error("generic virtual registers must have a type");
   }
 
-  if (hasRegState(Flags, RegState::Define)) {
+  if (IsDef) {
     if (hasRegState(Flags, RegState::Kill))
       return error("cannot have a killed def operand");
   } else {
@@ -1859,15 +1849,14 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
       return error("cannot have a dead use operand");
   }
 
-  Dest = MachineOperand::CreateReg(Reg, hasRegState(Flags, RegState::Define),
-                                   hasRegState(Flags, RegState::Implicit),
-                                   hasRegState(Flags, RegState::Kill),
-                                   hasRegState(Flags, RegState::Dead),
-                                   hasRegState(Flags, RegState::Undef),
-                                   hasRegState(Flags, RegState::EarlyClobber),
-                                   SubReg, hasRegState(Flags, RegState::Debug),
-                                   hasRegState(Flags, RegState::InternalRead),
-                                   hasRegState(Flags, RegState::Renamable));
+  Dest = MachineOperand::CreateReg(
+      Reg, IsDef, hasRegState(Flags, RegState::Implicit),
+      hasRegState(Flags, RegState::Kill), hasRegState(Flags, RegState::Dead),
+      hasRegState(Flags, RegState::Undef),
+      hasRegState(Flags, RegState::EarlyClobber), SubReg,
+      hasRegState(Flags, RegState::Debug),
+      hasRegState(Flags, RegState::InternalRead),
+      hasRegState(Flags, RegState::Renamable));
 
   return false;
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e4b4d80896fa7..3c7b46a9021da 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8439,6 +8439,10 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
   unsigned BW = VT.getScalarSizeInBits();
   unsigned Opcode = Node->getOpcode();
 
+  // Scalarize if the vector multiplication is unlikely to work.
+  if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
+    return DAG.UnrollVectorOp(Node);
+
   switch (Opcode) {
   case ISD::CLMUL: {
     // NOTE: If you change this expansion, please update the cost model
diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp
index fc30d421506a6..fb9f2ec333a7b 100644
--- a/llvm/lib/Support/CrashRecoveryContext.cpp
+++ b/llvm/lib/Support/CrashRecoveryContext.cpp
@@ -92,7 +92,8 @@ static LLVM_THREAD_LOCAL const CrashRecoveryContext *IsRecoveringFromCrash;
 
 } // namespace
 
-static void installExceptionOrSignalHandlers();
+static void
+installExceptionOrSignalHandlers(bool NeedsPOSIXUtilitySignalHandling);
 static void uninstallExceptionOrSignalHandlers();
 
 CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() = default;
@@ -137,13 +138,13 @@ CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
   return CRCI->CRC;
 }
 
-void CrashRecoveryContext::Enable() {
+void CrashRecoveryContext::Enable(bool NeedsPOSIXUtilitySignalHandling) {
   std::lock_guard<std::mutex> L(getCrashRecoveryContextMutex());
   // FIXME: Shouldn't this be a refcount or something?
   if (gCrashRecoveryEnabled)
     return;
   gCrashRecoveryEnabled = true;
-  installExceptionOrSignalHandlers();
+  installExceptionOrSignalHandlers(NeedsPOSIXUtilitySignalHandling);
 }
 
 void CrashRecoveryContext::Disable() {
@@ -193,7 +194,8 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
 // catches exceptions if they would bubble out from the stack frame with __try /
 // __except.
 
-static void installExceptionOrSignalHandlers() {}
+static void
+installExceptionOrSignalHandlers(bool NeedsPOSIXUtilitySignalHandling) {}
 static void uninstallExceptionOrSignalHandlers() {}
 
 // We need this function because the call to GetExceptionInformation() can only
@@ -309,7 +311,8 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
 // non-NULL, valid VEH handles, or NULL.
 static LLVM_THREAD_LOCAL const void* sCurrentExceptionHandle;
 
-static void installExceptionOrSignalHandlers() {
+static void
+installExceptionOrSignalHandlers(bool NeedsPOSIXUtilitySignalHandling) {
   // We can set up vectored exception handling now.  We will install our
   // handler as the front of the list, though there's no assurances that
   // it will remain at the front (another call could install itself before
@@ -390,7 +393,8 @@ static void CrashRecoverySignalHandler(int Signal) {
     const_cast<CrashRecoveryContextImpl *>(CRCI)->HandleCrash(RetCode, Signal);
 }
 
-static void installExceptionOrSignalHandlers() {
+static void
+installExceptionOrSignalHandlers(bool NeedsPOSIXUtilitySignalHandling) {
   // Setup the signal handler.
   struct sigaction Handler;
   Handler.sa_handler = CrashRecoverySignalHandler;
@@ -398,7 +402,14 @@ static void installExceptionOrSignalHandlers() {
   sigemptyset(&Handler.sa_mask);
 
   for (unsigned i = 0; i != NumSignals; ++i) {
-    sigaction(Signals[i], &Handler, &PrevActions[i]);
+    if (NeedsPOSIXUtilitySignalHandling) {
+      // Don't install the new handler if the signal disposition is SIG_IGN.
+      struct sigaction act;
+      if (sigaction(Signals[i], NULL, &act) == 0 && act.sa_handler != SIG_IGN)
+        sigaction(Signals[i], &Handler, &PrevActions[i]);
+    } else {
+      sigaction(Signals[i], &Handler, &PrevActions[i]);
+    }
   }
 }
 
diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp
index b90f4e0714458..797c5d35bec35 100644
--- a/llvm/lib/Support/InitLLVM.cpp
+++ b/llvm/lib/Support/InitLLVM.cpp
@@ -73,7 +73,8 @@ using namespace llvm;
 using namespace llvm::sys;
 
 InitLLVM::InitLLVM(int &Argc, const char **&Argv,
-                   bool InstallPipeSignalExitHandler) {
+                   bool InstallPipeSignalExitHandler,
+                   bool NeedsPOSIXUtilitySignalHandling) {
 #ifndef NDEBUG
   static std::atomic<bool> Initialized{false};
   assert(!Initialized && "InitLLVM was already initialized!");
@@ -81,7 +82,12 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv,
 #endif
 
   // Bring stdin/stdout/stderr into a known state.
+#ifdef _WIN32
   sys::AddSignalHandler(CleanupStdHandles, nullptr);
+#else
+  sys::AddSignalHandler(CleanupStdHandles, nullptr,
+                        NeedsPOSIXUtilitySignalHandling);
+#endif
 
   if (InstallPipeSignalExitHandler)
     // The pipe signal handler must be installed before any other handlers are
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 58d21154fed08..07e7781d0839d 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -632,31 +632,31 @@ KnownBits KnownBits::clmul(const KnownBits &LHS, const KnownBits &RHS) {
 
 std::optional<bool> KnownBits::eq(const KnownBits &LHS, const KnownBits &RHS) {
   if (LHS.isConstant() && RHS.isConstant())
-    return std::optional<bool>(LHS.getConstant() == RHS.getConstant());
+    return LHS.getConstant() == RHS.getConstant();
   if (LHS.One.intersects(RHS.Zero) || RHS.One.intersects(LHS.Zero))
-    return std::optional<bool>(false);
+    return false;
   return std::nullopt;
 }
 
 std::optional<bool> KnownBits::ne(const KnownBits &LHS, const KnownBits &RHS) {
   if (std::optional<bool> KnownEQ = eq(LHS, RHS))
-    return std::optional<bool>(!*KnownEQ);
+    return !*KnownEQ;
   return std::nullopt;
 }
 
 std::optional<bool> KnownBits::ugt(const KnownBits &LHS, const KnownBits &RHS) {
   // LHS >u RHS -> false if umax(LHS) <= umax(RHS)
   if (LHS.getMaxValue().ule(RHS.getMinValue()))
-    return std::optional<bool>(false);
+    return false;
   // LHS >u RHS -> true if umin(LHS) > umax(RHS)
   if (LHS.getMinValue().ugt(RHS.getMaxValue()))
-    return std::optional<bool>(true);
+    return true;
   return std::nullopt;
 }
 
 std::optional<bool> KnownBits::uge(const KnownBits &LHS, const KnownBits &RHS) {
   if (std::optional<bool> IsUGT = ugt(RHS, LHS))
-    return std::optional<bool>(!*IsUGT);
+    return !*IsUGT;
   return std::nullopt;
 }
 
@@ -671,16 +671,16 @@ std::optional<bool> KnownBits::ule(const KnownBits &LHS, const KnownBits &RHS) {
 std::optional<bool> KnownBits::sgt(const KnownBits &LHS, const KnownBits &RHS) {
   // LHS >s RHS -> false if smax(LHS) <= smax(RHS)
   if (LHS.getSignedMaxValue().sle(RHS.getSignedMinValue()))
-    return std::optional<bool>(false);
+    return false;
   // LHS >s RHS -> true if smin(LHS) > smax(RHS)
   if (LHS.getSignedMinValue().sgt(RHS.getSignedMaxValue()))
-    return std::optional<bool>(true);
+    return true;
   return std::nullopt;
 }
 
 std::optional<bool> KnownBits::sge(const KnownBits &LHS, const KnownBits &RHS) {
   if (std::optional<bool> KnownSGT = sgt(RHS, LHS))
-    return std::optional<bool>(!*KnownSGT);
+    return !*KnownSGT;
   return std::nullopt;
 }
 
diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc
index 56ad4fc504153..e861240189617 100644
--- a/llvm/lib/Support/Unix/Signals.inc
+++ b/llvm/lib/Support/Unix/Signals.inc
@@ -84,8 +84,10 @@
 
 using namespace llvm;
 
-static void SignalHandler(int Sig, siginfo_t *Info, void *);
+static void SignalHandler(int Sig, siginfo_t *Info, void *Context);
+static void SignalHandlerTerminate(int Sig, siginfo_t *Info, void *Context);
 static void InfoSignalHandler(int Sig); // defined below.
+static void InfoSignalHandlerTerminate(int Sig); // defined below.
 
 using SignalHandlerFunctionType = void (*)();
 /// The function to call if ctrl-c is pressed.
@@ -292,7 +294,8 @@ static void CreateSigAltStack() {
 static void CreateSigAltStack() {}
 #endif
 
-static void RegisterHandlers() { // Not signal-safe.
+static void RegisterHandlers(
+    bool NeedsPOSIXUtilitySignalHandling = false) { // Not signal-safe.
   // The mutex prevents other threads from registering handlers while we're
   // doing it. We also have to protect the handlers and their count because
   // a signal handler could fire while we're registering handlers.
@@ -317,18 +320,34 @@ static void RegisterHandlers() { // Not signal-safe.
 
     switch (Kind) {
     case SignalKind::IsKill:
-      NewHandler.sa_sigaction = SignalHandler;
+      if (NeedsPOSIXUtilitySignalHandling)
+        // If POSIX signal-handling semantics are followed, the signal handler
+        // resignals itself to terminate after handling the signal.
+        NewHandler.sa_sigaction = SignalHandlerTerminate;
+      else
+        NewHandler.sa_sigaction = SignalHandler;
       NewHandler.sa_flags = SA_NODEFER | SA_RESETHAND | SA_ONSTACK | SA_SIGINFO;
       break;
     case SignalKind::IsInfo:
-      NewHandler.sa_handler = InfoSignalHandler;
+      if (NeedsPOSIXUtilitySignalHandling)
+        // If POSIX signal-handling semantics are followed, the signal handler
+        // resignals itself to terminate after handling the signal.
+        NewHandler.sa_handler = InfoSignalHandlerTerminate;
+      else
+        NewHandler.sa_handler = InfoSignalHandler;
       NewHandler.sa_flags = SA_ONSTACK;
       break;
     }
     sigemptyset(&NewHandler.sa_mask);
 
-    // Install the new handler, save the old one in RegisteredSignalInfo.
-    sigaction(Signal, &NewHandler, &RegisteredSignalInfo[Index].SA);
+    if (NeedsPOSIXUtilitySignalHandling) {
+      // Don't install the new handler if the signal disposition is SIG_IGN.
+      struct sigaction act;
+      if (sigaction(Signal, NULL, &act) == 0 && act.sa_handler != SIG_IGN)
+        sigaction(Signal, &NewHandler, &RegisteredSignalInfo[Index].SA);
+    } else {
+      sigaction(Signal, &NewHandler, &RegisteredSignalInfo[Index].SA);
+    }
     RegisteredSignalInfo[Index].SigNo = Signal;
     ++NumRegisteredSignals;
   };
@@ -377,7 +396,7 @@ void sys::CleanupOnSignal(uintptr_t Context) {
 }
 
 // The signal handler that runs.
-static void SignalHandler(int Sig, siginfo_t *Info, void *) {
+static void SignalHandler(int Sig, siginfo_t *Info, void *Context) {
   // Restore the signal behavior to default, so that the program actually
   // crashes when we return and the signal reissues.  This also ensures that if
   // we crash in our signal handler that the program will terminate immediately
@@ -437,12 +456,30 @@ static void SignalHandler(int Sig, siginfo_t *Info, void *) {
 #endif
 }
 
+static void SignalHandlerTerminate(int Sig, siginfo_t *Info, void *Context) {
+  SignalHandler(Sig, Info, Context);
+
+  // Resignal if it is a kill signal so that the exit code contains the
+  // terminating signal number.
+  if (llvm::is_contained(KillSigs, Sig))
+    raise(Sig); // Execute the default handler.
+}
+
 static void InfoSignalHandler(int Sig) {
   SaveAndRestore SaveErrnoDuringASignalHandler(errno);
   if (SignalHandlerFunctionType CurrentInfoFunction = InfoSignalFunction)
     CurrentInfoFunction();
 }
 
+static void InfoSignalHandlerTerminate(int Sig) {
+  InfoSignalHandler(Sig);
+
+  if (Sig == SIGUSR1) {
+    sys::unregisterHandlers();
+    raise(Sig);
+  }
+}
+
 void sys::RunInterruptHandlers() {
   // Let's not interfere with stack trace symbolication and friends.
   auto BypassSandbox = sandbox::scopedDisable();
@@ -488,10 +525,11 @@ void llvm::sys::DontRemoveFileOnSignal(StringRef Filename) {
 /// Add a function to be called when a signal is delivered to the process. The
 /// handler can have a cookie passed to it to identify what instance of the
 /// handler it is.
-void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr,
-                                 void *Cookie) { // Signal-safe.
+void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr, void *Cookie,
+                                 bool NeedsPOSIXUtilitySignalHandling) {
+  // Signal-safe.
   insertSignalHandler(FnPtr, Cookie);
-  RegisterHandlers();
+  RegisterHandlers(NeedsPOSIXUtilitySignalHandling);
 }
 
 #if ENABLE_BACKTRACES && defined(HAVE_BACKTRACE) &&                            \
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index eec112e5a80f5..7e6799befb983 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -549,8 +549,8 @@ void llvm::sys::CallOneShotPipeSignalHandler() {
 /// Add a function to be called when a signal is delivered to the process. The
 /// handler can have a cookie passed to it to identify what instance of the
 /// handler it is.
-void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr,
-                                 void *Cookie) {
+void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr, void *Cookie,
+                                 bool NeedsPOSIXUtilitySignalHandling) {
   insertSignalHandler(FnPtr, Cookie);
   RegisterHandler();
   LeaveCriticalSection(&CriticalSection);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index ac6b9dd5dbd68..9babe675200c1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -890,11 +890,11 @@ enum SMEMatrixType {
 #undef TSFLAG_INSTR_FLAGS
 #undef TSFLAG_SME_MATRIX_TYPE
 
-int64_t getSVEPseudoMap(uint32_t Opcode);
-int64_t getSVERevInstr(uint32_t Opcode);
-int64_t getSVENonRevInstr(uint32_t Opcode);
+int32_t getSVEPseudoMap(uint32_t Opcode);
+int32_t getSVERevInstr(uint32_t Opcode);
+int32_t getSVENonRevInstr(uint32_t Opcode);
 
-int64_t getSMEPseudoMap(uint32_t Opcode);
+int32_t getSMEPseudoMap(uint32_t Opcode);
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 7630007d0f8af..a192f788ead71 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -1243,7 +1243,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       // Fp conversions to i16 must be kept on fp register banks to ensure
       // proper saturation, as there are no 16-bit gprs.
       // In addition, conversion intrinsics have fpr output when the input
-      // size matches the output size, or PRCVT is present.
+      // size matches the output size, or FPRCVT is present.
       if (DstSize == 16 ||
           ((DstSize == SrcSize || STI.hasFeature(AArch64::FeatureFPRCVT)) &&
            all_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 9ad2f2e11fbcc..5f0341c5aaa92 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -775,6 +775,14 @@ defm CvtFP8VOP1Bug : AMDGPUSubtargetFeature<"cvt-fp8-vop1-bug",
   [FeatureFP8ConversionInsts]
 >;
 
+defm WMMA256bInsts : AMDGPUSubtargetFeature<"wmma-256b-insts",
+  "Has WMMA instructions where A and B matrices have duplicated data"
+>;
+
+defm WMMA128bInsts : AMDGPUSubtargetFeature<"wmma-128b-insts",
+  "Has WMMA instructions where A and B matrices do not have duplicated data"
+>;
+
 defm PkFmacF16Inst : AMDGPUSubtargetFeature<"pk-fmac-f16-inst",
   "Has v_pk_fmac_f16 instruction"
 >;
@@ -1799,7 +1807,6 @@ def FeatureISAVersion11_Common : FeatureSet<
    FeatureBackOffBarrier,
    FeatureLDSBankCount32,
    FeatureDLInsts,
-   FeatureDot5Insts,
    FeatureDot7Insts,
    FeatureDot8Insts,
    FeatureDot9Insts,
@@ -1820,9 +1827,9 @@ def FeatureISAVersion11_Common : FeatureSet<
    FeatureD16Writes32BitVgpr,
 ]>;
 
-// There are few workarounds that need to be
-// added to all targets. This pessimizes codegen
-// a bit on the generic GFX11 target.
+// There are a few workarounds that need to be added to all targets. This
+// pessimizes codegen a bit on the generic GFX11 target. This generic target
+// does not include GFX1170 due to incompatible changes.
 def FeatureISAVersion11_Generic: FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureMSAALoadDstSelBug,
@@ -1831,14 +1838,18 @@ def FeatureISAVersion11_Generic: FeatureSet<
      FeatureMADIntraFwdBug,
      FeaturePrivEnabledTrap2NopBug,
      FeatureRequiresCOV6,
-     FeatureRequiredExportPriority])>;
+     FeatureRequiredExportPriority,
+     FeatureDot5Insts,
+     FeatureWMMA256bInsts])>;
 
 def FeatureISAVersion11_0_Common : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureMSAALoadDstSelBug,
      FeatureVALUTransUseHazard,
      FeatureMADIntraFwdBug,
-     FeaturePrivEnabledTrap2NopBug])>;
+     FeaturePrivEnabledTrap2NopBug,
+     FeatureDot5Insts,
+     FeatureWMMA256bInsts])>;
 
 def FeatureISAVersion11_0_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_0_Common.Features,
@@ -1861,7 +1872,9 @@ def FeatureISAVersion11_5_Common : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
     [FeatureSALUFloatInsts,
      FeatureDPPSrc1SGPR,
-     FeatureRequiredExportPriority])>;
+     FeatureRequiredExportPriority,
+     FeatureDot5Insts,
+     FeatureWMMA256bInsts])>;
 
 def FeatureISAVersion11_5_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_5_Common.Features,
@@ -1885,7 +1898,8 @@ def FeatureISAVersion11_7_0 : FeatureSet<
     [FeatureSALUFloatInsts,
      FeatureDPPSrc1SGPR,
      FeatureFP8ConversionInsts,
-     FeatureDot11Insts])>;
+     FeatureDot11Insts,
+     FeatureWMMA128bInsts])>;
 
 def FeatureISAVersion12 : FeatureSet<
   [FeatureGFX12,
@@ -1915,6 +1929,7 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureImageInsts,
    FeatureExtendedImageInsts,
    FeatureFP8ConversionInsts,
+   FeatureWMMA128bInsts,
    FeatureIEEEMinimumMaximumInsts,
    FeaturePackedTID,
    FeatureVcmpxPermlaneHazard,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index f4872ec63f7c3..4c8b91da765f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -148,7 +148,7 @@ class AMDGPULowerVGPREncoding {
   /// bit mapping. Optionally takes second array \p Ops2 for VOPD.
   /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
   /// is checked.
-  void computeMode(ModeTy &NewMode, MachineInstr &MI,
+  void computeMode(ModeTy &NewMode, const MachineInstr &MI,
                    const AMDGPU::OpName Ops[OpNum],
                    const AMDGPU::OpName *Ops2 = nullptr);
 
@@ -224,13 +224,14 @@ AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
   return Idx >> 8;
 }
 
-void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, MachineInstr &MI,
+void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode,
+                                          const MachineInstr &MI,
                                           const AMDGPU::OpName Ops[OpNum],
                                           const AMDGPU::OpName *Ops2) {
   NewMode = {};
 
   for (unsigned I = 0; I < OpNum; ++I) {
-    MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
+    const MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
 
     std::optional<unsigned> MSBits;
     if (Op)
@@ -238,7 +239,7 @@ void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, MachineInstr &MI,
 
 #if !defined(NDEBUG)
     if (MSBits.has_value() && Ops2) {
-      auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
+      const MachineOperand *Op2 = TII->getNamedOperand(MI, Ops2[I]);
       if (Op2) {
         std::optional<unsigned> MSBits2;
         MSBits2 = getMSBs(*Op2);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 998a9d0910a07..01cc4ff4ae854 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1556,6 +1556,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
     return AMDGPU::isGFX11Plus(getSTI());
   }
 
+  bool isGFX1170() const { return AMDGPU::isGFX1170(getSTI()); }
+
   bool isGFX12() const { return AMDGPU::isGFX12(getSTI()); }
 
   bool isGFX12Plus() const { return AMDGPU::isGFX12Plus(getSTI()); }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index b2dfd098735a0..2309a56f612f1 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -686,11 +686,19 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                         Address, CS))
         break;
 
+      if (isGFX1170() &&
+          tryDecodeInst(DecoderTableGFX117064, MI, QW, Address, CS))
+        break;
+
       if (isGFX11() &&
           tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
                         Address, CS))
         break;
 
+      if (isGFX1170() &&
+          tryDecodeInst(DecoderTableGFX1170W6464, MI, QW, Address, CS))
+        break;
+
       if (isGFX11() &&
           tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
         break;
@@ -2247,6 +2255,8 @@ bool AMDGPUDisassembler::isGFX11Plus() const {
   return AMDGPU::isGFX11Plus(STI);
 }
 
+bool AMDGPUDisassembler::isGFX1170() const { return AMDGPU::isGFX1170(STI); }
+
 bool AMDGPUDisassembler::isGFX12() const {
   return STI.hasFeature(AMDGPU::FeatureGFX12);
 }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 28f71d8d7556b..b01eb8dd59fad 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -178,6 +178,7 @@ class AMDGPUDisassembler : public MCDisassembler {
   bool isGFX10() const;
   bool isGFX10Plus() const;
   bool isGFX11() const;
+  bool isGFX1170() const;
   bool isGFX11Plus() const;
   bool isGFX12() const;
   bool isGFX12Plus() const;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b308e0d77305f..2365b6175a46f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -396,6 +396,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
   }
 
+  bool isGFX1170() const {
+    return getGeneration() == GFX11 && hasWMMA128bInsts();
+  }
+
   bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
 
   bool hasAtomicFaddInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index b96c17e137072..f6e9d2d485444 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -326,7 +326,7 @@ class R600InstrInfo final : public R600GenInstrInfo {
 
 namespace R600 {
 
-int64_t getLDSNoRetOp(uint32_t Opcode);
+int32_t getLDSNoRetOp(uint32_t Opcode);
 
 } //End namespace AMDGPU
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9180d5fc8bcf0..d7c997c1f5092 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1159,7 +1159,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 }
 
 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
-  int64_t NewOpc;
+  int32_t NewOpc;
 
   // Try to map original to commuted opcode
   NewOpc = AMDGPU::getCommuteRev(Opcode);
@@ -10377,9 +10377,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
       Opcode = MFMAOp;
   }
 
-  int64_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+  int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
 
-  if (MCOp == (uint32_t)-1 && ST.hasGFX1250Insts())
+  if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
     MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
 
   // -1 means that Opcode is already a native instruction.
@@ -10387,20 +10387,20 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
     return Opcode;
 
   if (ST.hasGFX90AInsts()) {
-    uint32_t NMCOp = (uint32_t)-1;
+    uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
     if (ST.hasGFX940Insts())
       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
-    if (NMCOp == (uint32_t)-1)
+    if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
-    if (NMCOp == (uint32_t)-1)
+    if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
-    if (NMCOp != (uint32_t)-1)
+    if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
       MCOp = NMCOp;
   }
 
-  // (uint32_t)-1 means that Opcode is a pseudo instruction that has
-  // no encoding in the given subtarget generation.
-  if (MCOp == (uint32_t)-1)
+  // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
+  // encoding in the given subtarget generation.
+  if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
     return -1;
 
   if (isAsmOnlyOpcode(MCOp))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0b54513bb6114..c945533f0f2ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1737,86 +1737,86 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
 namespace AMDGPU {
 
   LLVM_READONLY
-  int64_t getVOPe64(uint32_t Opcode);
+  int32_t getVOPe64(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getVOPe32(uint32_t Opcode);
+  int32_t getVOPe32(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getSDWAOp(uint32_t Opcode);
+  int32_t getSDWAOp(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getDPPOp32(uint32_t Opcode);
+  int32_t getDPPOp32(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getDPPOp64(uint32_t Opcode);
+  int32_t getDPPOp64(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getBasicFromSDWAOp(uint32_t Opcode);
+  int32_t getBasicFromSDWAOp(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getCommuteRev(uint32_t Opcode);
+  int32_t getCommuteRev(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getCommuteOrig(uint32_t Opcode);
+  int32_t getCommuteOrig(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getAddr64Inst(uint32_t Opcode);
+  int32_t getAddr64Inst(uint32_t Opcode);
 
   /// Check if \p Opcode is an Addr64 opcode.
   ///
   /// \returns \p Opcode if it is an Addr64 opcode, otherwise -1.
   LLVM_READONLY
-  int64_t getIfAddr64Inst(uint32_t Opcode);
+  int32_t getIfAddr64Inst(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getSOPKOp(uint32_t Opcode);
+  int32_t getSOPKOp(uint32_t Opcode);
 
   /// \returns SADDR form of a FLAT Global instruction given an \p Opcode
   /// of a VADDR form.
   LLVM_READONLY
-  int64_t getGlobalSaddrOp(uint32_t Opcode);
+  int32_t getGlobalSaddrOp(uint32_t Opcode);
 
   /// \returns VADDR form of a FLAT Global instruction given an \p Opcode
   /// of a SADDR form.
   LLVM_READONLY
-  int64_t getGlobalVaddrOp(uint32_t Opcode);
+  int32_t getGlobalVaddrOp(uint32_t Opcode);
 
   LLVM_READONLY
-  int64_t getVCMPXNoSDstOp(uint32_t Opcode);
+  int32_t getVCMPXNoSDstOp(uint32_t Opcode);
 
   /// \returns ST form with only immediate offset of a FLAT Scratch instruction
   /// given an \p Opcode of an SS (SADDR) form.
   LLVM_READONLY
-  int64_t getFlatScratchInstSTfromSS(uint32_t Opcode);
+  int32_t getFlatScratchInstSTfromSS(uint32_t Opcode);
 
   /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
   /// of an SVS (SADDR + VADDR) form.
   LLVM_READONLY
-  int64_t getFlatScratchInstSVfromSVS(uint32_t Opcode);
+  int32_t getFlatScratchInstSVfromSVS(uint32_t Opcode);
 
   /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode
   /// of an SV (VADDR) form.
   LLVM_READONLY
-  int64_t getFlatScratchInstSSfromSV(uint32_t Opcode);
+  int32_t getFlatScratchInstSSfromSV(uint32_t Opcode);
 
   /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
   /// of an SS (SADDR) form.
   LLVM_READONLY
-  int64_t getFlatScratchInstSVfromSS(uint32_t Opcode);
+  int32_t getFlatScratchInstSVfromSS(uint32_t Opcode);
 
   /// \returns earlyclobber version of a MAC MFMA is exists.
   LLVM_READONLY
-  int64_t getMFMAEarlyClobberOp(uint32_t Opcode);
+  int32_t getMFMAEarlyClobberOp(uint32_t Opcode);
 
   /// \returns Version of an MFMA instruction which uses AGPRs for srcC and
   /// vdst, given an \p Opcode of an MFMA which uses VGPRs for srcC/vdst.
   LLVM_READONLY
-  int64_t getMFMASrcCVDstAGPROp(uint32_t Opcode);
+  int32_t getMFMASrcCVDstAGPROp(uint32_t Opcode);
 
   /// \returns v_cmpx version of a v_cmp instruction.
   LLVM_READONLY
-  int64_t getVCMPXOpFromVCMP(uint32_t Opcode);
+  int32_t getVCMPXOpFromVCMP(uint32_t Opcode);
 
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f063b4eb77774..c2396674e4f96 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -44,9 +44,10 @@ class GFXGen<Predicate pred, string dn, string suffix, int sub> {
 def GFX13Gen         : GFXGen<isGFX13Only, "GFX13", "_gfx13", SIEncodingFamily.GFX13>;
 def GFX1250Gen       : GFXGen<isGFX125xOnly, "GFX1250", "_gfx1250", SIEncodingFamily.GFX1250>;
 def GFX12Not12_50Gen : GFXGen<isGFX12Not12_50, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
-def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
-def GFX11Gen : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>;
-def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
+def GFX12Gen         : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
+def GFX1170Gen       : GFXGen<isGFX11Only, "GFX1170", "_gfx1170", SIEncodingFamily.GFX11>;
+def GFX11Gen         : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>;
+def GFX10Gen         : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
 
 //===----------------------------------------------------------------------===//
 // SI DAG Nodes
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3f32d1166fc89..3ffae37bbf239 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -812,7 +812,7 @@ unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) {
 // Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
 // header files, so we need to wrap it in a function that takes unsigned
 // instead.
-int64_t getMCOpcode(uint32_t Opcode, unsigned Gen) {
+int32_t getMCOpcode(uint32_t Opcode, unsigned Gen) {
   return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
 }
 
@@ -2598,6 +2598,10 @@ bool isGFX11(const MCSubtargetInfo &STI) {
   return STI.hasFeature(AMDGPU::FeatureGFX11);
 }
 
+bool isGFX1170(const MCSubtargetInfo &STI) {
+  return isGFX11(STI) && STI.hasFeature(AMDGPU::FeatureWMMA128bInsts);
+}
+
 bool isGFX11Plus(const MCSubtargetInfo &STI) {
   return isGFX11(STI) || isGFX12Plus(STI);
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 7500c2481a4bd..fa24383c90fa6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -416,7 +416,7 @@ inline bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx) {
 }
 
 LLVM_READONLY
-int64_t getSOPPWithRelaxation(uint32_t Opcode);
+int32_t getSOPPWithRelaxation(uint32_t Opcode);
 
 struct MIMGBaseOpcodeInfo {
   MIMGBaseOpcode BaseOpcode;
@@ -646,7 +646,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
                                                   const MCSubtargetInfo &STI);
 
 LLVM_READONLY
-int64_t getMCOpcode(uint32_t Opcode, unsigned Gen);
+int32_t getMCOpcode(uint32_t Opcode, unsigned Gen);
 
 LLVM_READONLY
 unsigned getVOPDOpcode(unsigned Opc, bool VOPD3);
@@ -1705,6 +1705,7 @@ bool isGFX10Plus(const MCSubtargetInfo &STI);
 bool isNotGFX10Plus(const MCSubtargetInfo &STI);
 bool isGFX10Before1030(const MCSubtargetInfo &STI);
 bool isGFX11(const MCSubtargetInfo &STI);
+bool isGFX1170(const MCSubtargetInfo &STI);
 bool isGFX11Plus(const MCSubtargetInfo &STI);
 bool isGFX12(const MCSubtargetInfo &STI);
 bool isGFX12Plus(const MCSubtargetInfo &STI);
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 9a4054b8ad248..9e62dc7c9db0a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1426,22 +1426,18 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
   defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
 
   defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
-  let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
-    let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
-      def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
-    }
-  }
-  if convertibleTo3Addr then {
+
+  let SubtargetPredicate = HasWMMA256bInsts in {
     let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
-      let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
-        def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+      let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
+        def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+      }
+      if convertibleTo3Addr then {
+        let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
+          def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+        }
       }
     }
-    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
-                          !cast<Instruction>(NAME # _threeaddr # Suffix)>;
-  }
-
-  let SubtargetPredicate = isGFX11Only in {
     if !eq(Type, WMMAOpSel) then {
       def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
     } else if !eq(Type, WMMAUIClamp) then {
@@ -1450,6 +1446,11 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
       def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
     }
   }
+
+  if convertibleTo3Addr then {
+    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
+                            !cast<Instruction>(NAME # _threeaddr # Suffix)>;
+  }
 }
 
 
@@ -1727,7 +1728,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
   defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
   defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
 
-  let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in {
+  let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1, SubtargetPredicate = HasWMMA128bInsts in {
     let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
       def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
         let PseudoInstr = Instr#PseudoInstrSuffix;
@@ -2047,7 +2048,7 @@ class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile
             let WaveSizePredicate = isWave64;
           }
 
-let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in {
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX11PlusNot12_50, OtherPredicates = [HasWMMA128bInsts] in {
   defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32",     int_amdgcn_wmma_f32_16x16x16_f16,     F32_F16_WMMA_w32>;
   defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32",    int_amdgcn_wmma_f32_16x16x16_bf16,    F32_BF16_WMMA_w32>;
   defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32",     int_amdgcn_wmma_f16_16x16x16_f16,     F16_F16_WMMA_w32,1>;
@@ -2074,7 +2075,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in {
   def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
 }
 
-let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
+let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX11PlusNot12_50, OtherPredicates = [HasWMMA128bInsts] in {
   defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64",     int_amdgcn_wmma_f32_16x16x16_f16,     F32_F16_WMMA_w64>;
   defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64",    int_amdgcn_wmma_f32_16x16x16_bf16,    F32_BF16_WMMA_w64>;
   defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64",     int_amdgcn_wmma_f16_16x16x16_f16,     F16_F16_WMMA_w64,1>;
@@ -2229,6 +2230,18 @@ multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<8> op, VOP3PWMMA_Profile WMMAP,
     VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
 }
 
+multiclass VOP3P_Real_WMMA_gfx1170 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
+  let WaveSizePredicate = isWave32, DecoderNamespace = "GFX1170" in {
+    defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1170Gen, op, WMMAP>;
+  }
+}
+
+multiclass VOP3P_Real_WMMA_gfx1170w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
+  let WaveSizePredicate = isWave64, DecoderNamespace = "GFX1170W64" in {
+    defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1170Gen, op, WMMAP>;
+  }
+}
+
 multiclass VOP3P_Real_WMMA_gfx12 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
   let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
     defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
@@ -2241,6 +2254,14 @@ multiclass VOP3P_Real_WMMA_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
   }
 }
 
+multiclass VOP3P_Real_WMMA_gfx1170_gfx12 <bits<8> op, VOP3PWMMA_Profile WMMAP> :
+  VOP3P_Real_WMMA_gfx1170<op, WMMAP>,
+  VOP3P_Real_WMMA_gfx12<op, WMMAP>;
+
+multiclass VOP3P_Real_WMMA_gfx1170_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> :
+  VOP3P_Real_WMMA_gfx1170w64<op, WMMAP>,
+  VOP3P_Real_WMMA_gfx12w64<op, WMMAP>;
+
 multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
   let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
     defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1250Gen, op, WMMAP>;
@@ -2345,54 +2366,53 @@ multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<string Gen, bits<8> op, bits<8> Ld
   }
 }
 
-defm V_WMMA_F32_16X16X16_F16_w32     : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_BF16_w32    : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
-defm V_WMMA_F16_16X16X16_F16_w32     : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
-defm V_WMMA_BF16_16X16X16_BF16_w32   : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>;
-defm V_WMMA_I32_16X16X16_IU8_w32     : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>;
-defm V_WMMA_I32_16X16X16_IU4_w32     : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
-defm V_WMMA_I32_16X16X32_IU4_w32     : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
-
-defm V_WMMA_F32_16X16X16_F16_w64     : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_BF16_w64    : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
-defm V_WMMA_F16_16X16X16_F16_w64     : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>;
-defm V_WMMA_BF16_16X16X16_BF16_w64   : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
-defm V_WMMA_I32_16X16X16_IU8_w64     : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
-defm V_WMMA_I32_16X16X16_IU4_w64     : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
-defm V_WMMA_I32_16X16X32_IU4_w64     : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
-
-
-defm V_SWMMAC_F32_16X16X32_F16_w32     : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_BF16_w32    : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
-defm V_SWMMAC_F16_16X16X32_F16_w32     : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>;
-defm V_SWMMAC_BF16_16X16X32_BF16_w32   : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
-defm V_SWMMAC_I32_16X16X32_IU8_w32     : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
-defm V_SWMMAC_I32_16X16X32_IU4_w32     : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
-defm V_SWMMAC_I32_16X16X64_IU4_w32     : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
-
-defm V_SWMMAC_F32_16X16X32_F16_w64     : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_BF16_w64    : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
-defm V_SWMMAC_F16_16X16X32_F16_w64     : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
-defm V_SWMMAC_BF16_16X16X32_BF16_w64   : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
-defm V_SWMMAC_I32_16X16X32_IU8_w64     : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
-defm V_SWMMAC_I32_16X16X32_IU4_w64     : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
-defm V_SWMMAC_I32_16X16X64_IU4_w64     : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
+defm V_WMMA_F32_16X16X16_F16_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x040, F32_F16_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF16_w32    : VOP3P_Real_WMMA_gfx1170_gfx12 <0x041, F32_BF16_WMMA_w32>;
+defm V_WMMA_F16_16X16X16_F16_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x042, F16_F16_WMMA_w32>;
+defm V_WMMA_BF16_16X16X16_BF16_w32   : VOP3P_Real_WMMA_gfx1170_gfx12 <0x043, BF16_BF16_WMMA_w32>;
+defm V_WMMA_I32_16X16X16_IU8_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x044, I32_IU8_WMMA_w32>;
+defm V_WMMA_I32_16X16X16_IU4_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_I32_16X16X32_IU4_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
+
+defm V_WMMA_F32_16X16X16_F16_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x040, F32_F16_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF16_w64    : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
+defm V_WMMA_F16_16X16X16_F16_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x042, F16_F16_WMMA_w64>;
+defm V_WMMA_BF16_16X16X16_BF16_w64   : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
+defm V_WMMA_I32_16X16X16_IU8_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
+defm V_WMMA_I32_16X16X16_IU4_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_I32_16X16X32_IU4_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
+
+defm V_SWMMAC_F32_16X16X32_F16_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x050, F32_F16_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF16_w32    : VOP3P_Real_WMMA_gfx1170_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X32_F16_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x052, F16_F16_SWMMAC_w32>;
+defm V_SWMMAC_BF16_16X16X32_BF16_w32   : VOP3P_Real_WMMA_gfx1170_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X32_IU8_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X32_IU4_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X64_IU4_w32     : VOP3P_Real_WMMA_gfx1170_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
+
+defm V_SWMMAC_F32_16X16X32_F16_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF16_w64    : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
+defm V_SWMMAC_F16_16X16X32_F16_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
+defm V_SWMMAC_BF16_16X16X32_BF16_w64   : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X32_IU8_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X32_IU4_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X64_IU4_w64     : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
 
 defm V_WMMA_F32_16X16X4_F32_w32       : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>;
 defm V_WMMA_F32_16X16X32_BF16_w32     : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>;
diff --git a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index afa1345fdb469..41f066b49cfd5 100644
--- a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -78,7 +78,11 @@ bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) {
 }
 
 // FIXME: Remove this when we don't need this:
-namespace llvm { namespace PPC { extern int64_t getNonRecordFormOpcode(uint32_t); } }
+namespace llvm {
+namespace PPC {
+extern int32_t getNonRecordFormOpcode(uint32_t);
+}
+} // namespace llvm
 
 // FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific.
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 3c2ad1b30b139..bf1755733392c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12046,31 +12046,15 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
   }
 
   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
-  SDValue Lo =
-      DAG.getNode(PPCISD::INST512, dl, MVT::v512i1, Loads[0], Loads[1]);
-  SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
-  SDValue Hi =
-      DAG.getNode(PPCISD::INST512HI, dl, MVT::v512i1, Loads[2], Loads[3]);
-  SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
-  SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
-  const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
-
-  SDValue Value =
-      SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0);
+  SDValue Value = DMFInsert1024(Loads, dl, DAG);
 
   if (IsV1024i1) {
     return DAG.getMergeValues({Value, TF}, dl);
   }
 
   // Handle Loads for V2048i1 which represents a dmr pair.
-  SDValue DmrPValue;
-  SDValue Dmr1Lo =
-      DAG.getNode(PPCISD::INST512, dl, MVT::v512i1, Loads[4], Loads[5]);
-  SDValue Dmr1Hi =
-      DAG.getNode(PPCISD::INST512HI, dl, MVT::v512i1, Loads[6], Loads[7]);
-  const SDValue Dmr1Ops[] = {RC, Dmr1Lo, LoSub, Dmr1Hi, HiSub};
-  SDValue Dmr1Value = SDValue(
-      DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Dmr1Ops), 0);
+  SmallVector<SDValue, 4> MoreLoads{Loads[4], Loads[5], Loads[6], Loads[7]};
+  SDValue Dmr1Value = DMFInsert1024(MoreLoads, dl, DAG);
 
   SDValue Dmr0Sub = DAG.getTargetConstant(PPC::sub_dmr0, dl, MVT::i32);
   SDValue Dmr1Sub = DAG.getTargetConstant(PPC::sub_dmr1, dl, MVT::i32);
@@ -12078,7 +12062,7 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
   SDValue DmrPRC = DAG.getTargetConstant(PPC::DMRpRCRegClassID, dl, MVT::i32);
   const SDValue DmrPOps[] = {DmrPRC, Value, Dmr0Sub, Dmr1Value, Dmr1Sub};
 
-  DmrPValue = SDValue(
+  SDValue DmrPValue = SDValue(
       DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v2048i1, DmrPOps), 0);
 
   return DAG.getMergeValues({DmrPValue, TF}, dl);
diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index f33d4bf89381b..9114ded4a3cc4 100644
--- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -47,7 +47,7 @@ static cl::opt<bool> DisableVSXFMAMutate(
 #define DEBUG_TYPE "ppc-vsx-fma-mutate"
 
 namespace llvm { namespace PPC {
-  int64_t getAltVSXFMAOpcode(uint32_t Opcode);
+int32_t getAltVSXFMAOpcode(uint32_t Opcode);
 } }
 
 namespace {
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 521b14e498af5..4168cb473fdf1 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -3829,68 +3829,63 @@ bool RISCVAsmParser::validateInstruction(MCInst &Inst,
   if (!(MCID.TSFlags & RISCVII::RVVConstraintMask))
     return false;
 
-  if (Opcode == RISCV::SF_VC_V_XVW || Opcode == RISCV::SF_VC_V_IVW ||
-      Opcode == RISCV::SF_VC_V_FVW || Opcode == RISCV::SF_VC_V_VVW) {
-    // Operands Opcode, Dst, uimm, Dst, Rs2, Rs1 for SF_VC_V_XVW.
-    MCRegister VCIXDst = Inst.getOperand(0).getReg();
-    SMLoc VCIXDstLoc = Operands[2]->getStartLoc();
-    if (MCID.TSFlags & RISCVII::VS1Constraint) {
-      MCRegister VCIXRs1 = Inst.getOperand(Inst.getNumOperands() - 1).getReg();
-      if (VCIXDst == VCIXRs1)
-        return Error(VCIXDstLoc, "the destination vector register group cannot"
-                                 " overlap the source vector register group");
-    }
-    if (MCID.TSFlags & RISCVII::VS2Constraint) {
-      MCRegister VCIXRs2 = Inst.getOperand(Inst.getNumOperands() - 2).getReg();
-      if (VCIXDst == VCIXRs2)
-        return Error(VCIXDstLoc, "the destination vector register group cannot"
-                                 " overlap the source vector register group");
-    }
-    return false;
-  }
+  int DestIdx = RISCV::getNamedOperandIdx(Inst.getOpcode(), RISCV::OpName::vd);
+  MCRegister DestReg = Inst.getOperand(DestIdx).getReg();
 
-  MCRegister DestReg = Inst.getOperand(0).getReg();
-  unsigned Offset = 0;
-  int TiedOp = MCID.getOperandConstraint(1, MCOI::TIED_TO);
-  if (TiedOp == 0)
-    Offset = 1;
+  // Operands[1] or Operands[2] will be the first operand, DestReg.
+  const MCParsedAsmOperand *ParsedOp = Operands[1].get();
+  if (!ParsedOp->isReg()) {
+    // XSfvcp instructions may have an immediate before vd.
+    // FIXME: Is there a better way to do this?
+    ParsedOp = Operands[2].get();
+  }
+  assert(ParsedOp->getReg() == DestReg && "Can't find parsed dest operand");
+  SMLoc Loc = ParsedOp->getStartLoc();
 
-  // Operands[1] will be the first operand, DestReg.
-  SMLoc Loc = Operands[1]->getStartLoc();
   if (MCID.TSFlags & RISCVII::VS2Constraint) {
-    MCRegister CheckReg = Inst.getOperand(Offset + 1).getReg();
+    int VS2Idx =
+        RISCV::getNamedOperandIdx(Inst.getOpcode(), RISCV::OpName::vs2);
+    assert(VS2Idx >= 0 && "No vs2 operand?");
+    MCRegister CheckReg = Inst.getOperand(VS2Idx).getReg();
     if (DestReg == CheckReg)
       return Error(Loc, "the destination vector register group cannot overlap"
                         " the source vector register group");
   }
-  if ((MCID.TSFlags & RISCVII::VS1Constraint) && Inst.getOperand(Offset + 2).isReg()) {
-    MCRegister CheckReg = Inst.getOperand(Offset + 2).getReg();
-    if (DestReg == CheckReg)
-      return Error(Loc, "the destination vector register group cannot overlap"
-                        " the source vector register group");
+  if (MCID.TSFlags & RISCVII::VS1Constraint) {
+    int VS1Idx =
+        RISCV::getNamedOperandIdx(Inst.getOpcode(), RISCV::OpName::vs1);
+    // FIXME: The vs1 constraint is used on scalar and imm instructions so we
+    // need to check that the operand exists.
+    if (VS1Idx >= 0) {
+      MCRegister CheckReg = Inst.getOperand(VS1Idx).getReg();
+      if (DestReg == CheckReg)
+        return Error(Loc, "the destination vector register group cannot overlap"
+                          " the source vector register group");
+    }
   }
-  if ((MCID.TSFlags & RISCVII::VMConstraint) && (DestReg == RISCV::V0)) {
-    // vadc, vsbc are special cases. These instructions have no mask register.
-    // The destination register could not be V0.
-    if (Opcode == RISCV::VADC_VVM || Opcode == RISCV::VADC_VXM ||
-        Opcode == RISCV::VADC_VIM || Opcode == RISCV::VSBC_VVM ||
-        Opcode == RISCV::VSBC_VXM || Opcode == RISCV::VFMERGE_VFM ||
-        Opcode == RISCV::VMERGE_VIM || Opcode == RISCV::VMERGE_VVM ||
-        Opcode == RISCV::VMERGE_VXM)
-      return Error(Loc, "the destination vector register group cannot be V0");
-
-    // Regardless masked or unmasked version, the number of operands is the
-    // same. For example, "viota.m v0, v2" is "viota.m v0, v2, NoRegister"
-    // actually. We need to check the last operand to ensure whether it is
-    // masked or not.
-    MCRegister CheckReg = Inst.getOperand(Inst.getNumOperands() - 1).getReg();
-    assert((CheckReg == RISCV::V0 || !CheckReg) &&
-           "Unexpected register for mask operand");
 
-    if (DestReg == CheckReg)
-      return Error(Loc, "the destination vector register group cannot overlap"
-                        " the mask register");
+  if (MCID.TSFlags & RISCVII::VMConstraint) {
+    int VMIdx = RISCV::getNamedOperandIdx(Inst.getOpcode(), RISCV::OpName::vm);
+    assert(VMIdx >= 0 && "No vm operand?");
+
+    if (DestReg == RISCV::V0) {
+      if (MCID.operands()[Inst.getNumOperands() - 1].OperandType !=
+          RISCVOp::OPERAND_VMASK)
+        return Error(Loc, "the destination vector register group cannot be V0");
+
+      // Masked and unmasked versions have the same number of operands. For
+      // example, "viota.m v0, v2" is actually "viota.m v0, v2, NoRegister".
+      // We need to check the vm operand to see whether the instruction is
+      // masked or not.
+      MCRegister CheckReg = Inst.getOperand(VMIdx).getReg();
+      assert((!CheckReg.isValid() || CheckReg == RISCV::V0) &&
+             "Unexpected mask operand register");
+      if (CheckReg.isValid())
+        return Error(Loc, "the destination vector register group cannot overlap"
+                          " the mask register");
+    }
   }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 38d154d90d7e0..4cb41fc92c4ba 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -35,6 +35,7 @@
 
 #define GET_INSTRINFO_MC_DESC
 #define ENABLE_INSTR_PREDICATE_VERIFIER
+#define GET_INSTRINFO_NAMED_OPS
 #include "RISCVGenInstrInfo.inc"
 
 #define GET_REGINFO_MC_DESC
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
index d1733886637f8..39a34f6ae434e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -53,6 +53,7 @@ void updateCZceFeatureImplications(MCSubtargetInfo &STI);
 // Defines symbolic names for RISC-V instructions.
 #define GET_INSTRINFO_ENUM
 #define GET_INSTRINFO_MC_HELPER_DECLS
+#define GET_INSTRINFO_OPERAND_ENUM
 #include "RISCVGenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
index 8aa3fb341e3b4..c007838d31dfe 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
@@ -102,6 +102,8 @@ class RVInstVBase<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
 
   let Uses = [VL, VTYPE];
   let RVVConstraint = VMConstraint;
+
+  let UseNamedOperandTable = true;
 }
 
 class RVInstVV<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
@@ -160,6 +162,8 @@ class RVInstVUnaryRd<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, dag outs,
 
   let Uses = [VL, VTYPE];
   let RVVConstraint = NoConstraint;
+
+  let UseNamedOperandTable = true;
 }
 
 class RVInstVLoadBase<bits<3> nf, RISCVWidth width, RISCVMOP mop,
@@ -181,6 +185,8 @@ class RVInstVLoadBase<bits<3> nf, RISCVWidth width, RISCVMOP mop,
 
   let Uses = [VL, VTYPE];
   let RVVConstraint = VMConstraint;
+
+  let UseNamedOperandTable = true;
 }
 
 class RVInstVLU<bits<3> nf, RISCVWidth width, RISCVLUMOP lumop, dag outs,
@@ -224,6 +230,8 @@ class RVInstVStoreBase<bits<3> nf, RISCVWidth width, RISCVMOP mop, dag outs,
   let Inst{6-0} = OPC_STORE_FP.Value;
 
   let Uses = [VL, VTYPE];
+
+  let UseNamedOperandTable = true;
 }
 
 class RVInstVSU<bits<3> nf, RISCVWidth width, RISCVSUMOP sumop, dag outs,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2b496b1b20318..98561a9345daf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -42,7 +42,6 @@ using namespace llvm;
 #include "RISCVGenCompressInstEmitter.inc"
 
 #define GET_INSTRINFO_CTOR_DTOR
-#define GET_INSTRINFO_NAMED_OPS
 #include "RISCVGenInstrInfo.inc"
 
 #define DEBUG_TYPE "riscv-instr-info"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 2932efffdb814..cfe2e5c474fbd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -19,7 +19,6 @@
 #include "llvm/IR/DiagnosticInfo.h"
 
 #define GET_INSTRINFO_HEADER
-#define GET_INSTRINFO_OPERAND_ENUM
 #include "RISCVGenInstrInfo.inc"
 #include "RISCVGenRegisterInfo.inc"
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 1b8cf3ddf7d2e..8f97e81537f1b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -380,6 +380,8 @@ class NDSRVInstVSINTLN<bits<5> funct5, string opcodestr>
   let mayLoad = 1;
   let mayStore = 0;
   let Uses = [VL, VTYPE];
+
+  let UseNamedOperandTable = true;
 }
 
 class NDSRVInstVSINTCvt<bits<5> fucnt5, string opcodestr>
@@ -401,6 +403,8 @@ class NDSRVInstVSINTCvt<bits<5> fucnt5, string opcodestr>
   let mayStore = 0;
   let Uses = [FRM, VL, VTYPE];
   let RVVConstraint = VMConstraint;
+
+  let UseNamedOperandTable = true;
 }
 
 class NDSRVInstBFHCvt<bits<7> funct7, bits<5> rs1val, DAGOperand rdty,
@@ -435,6 +439,8 @@ class NDSRVInstVFPMAD<bits<6> funct6, string opcodestr>
   let mayStore = 0;
 
   let RVVConstraint = VMConstraint;
+
+  let UseNamedOperandTable = true;
 }
 
 class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr>
@@ -458,6 +464,8 @@ class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr>
   let mayStore = 0;
 
   let RVVConstraint = VMConstraint;
+
+  let UseNamedOperandTable = true;
 }
 
 class NDSRVInstVBFHCvt<bits<5> vs1, string opcodestr>
@@ -477,6 +485,8 @@ class NDSRVInstVBFHCvt<bits<5> vs1, string opcodestr>
   let mayStore = 0;
 
   let Uses = [VL, VTYPE];
+
+  let UseNamedOperandTable = true;
 }
 
 class NDSRVInstVLN<bits<5> funct5, string opcodestr>
@@ -500,6 +510,8 @@ class NDSRVInstVLN<bits<5> funct5, string opcodestr>
 
   let Uses = [VL, VTYPE];
   let RVVConstraint = VMConstraint;
+
+  let UseNamedOperandTable = true;
 }
 
 class VPseudoVLN8NoMask<VReg RetClass, bit U> :
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
index 51506f40d3811..601270d3be4ee 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
@@ -29,6 +29,8 @@ class CustomRivosVXI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
   let Uses = [VL, VTYPE];
   let RVVConstraint = NoConstraint;
   let Constraints = "$vd = $vd_wb";
+
+  let UseNamedOperandTable = true;
 }
 
 class CustomRivosXVI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
@@ -49,6 +51,8 @@ class CustomRivosXVI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
 
   let Uses = [VL, VTYPE];
   let RVVConstraint = NoConstraint;
+
+  let UseNamedOperandTable = true;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index bc65db1f77ffb..9cc5dbf595871 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -132,6 +132,8 @@ class RVInstVCCustom2Base<VCIXInfo info>
   let RVVConstraint = info.RVVConstraint;
   let ElementsDependOn = EltDepsVLMask;
   let ReadsPastVL = 1;
+
+  let UseNamedOperandTable = true;
 }
 
 // VCIX instructions with GPR rs1 operand
@@ -254,7 +256,7 @@ let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in {
 }
 
 let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector",
-    DestEEW = EEWSEWx4, RVVConstraint=VS2Constraint in {
+    DestEEW = EEWSEWx4, RVVConstraint=VS1Constraint in {
   def SF_VQMACCU_2x8x2  : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">;
   def SF_VQMACC_2x8x2   : CustomSiFiveVMACC<0b101101, OPMVV, "sf.vqmacc.2x8x2">;
   def SF_VQMACCUS_2x8x2 : CustomSiFiveVMACC<0b101110, OPMVV, "sf.vqmaccus.2x8x2">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index ebabd03731298..9aae940476ae2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -173,12 +173,10 @@ let Predicates = [HasStdExtZvkned] in {
   defm VAESDM     : VAES_MV_V_S<0b101000, 0b101001, 0b00000, OPMVV, "vaesdm">;
   defm VAESEF     : VAES_MV_V_S<0b101000, 0b101001, 0b00011, OPMVV, "vaesef">;
   defm VAESEM     : VAES_MV_V_S<0b101000, 0b101001, 0b00010, OPMVV, "vaesem">;
-  let RVVConstraint = NoConstraint in {
-    def  VAESKF1_VI : PALUVINoVm<0b100010, "vaeskf1.vi", uimm5>,
-                      SchedUnaryMC<"WriteVAESKF1V", "ReadVAESKF1V">;
-    def  VAESKF2_VI : PALUVINoVmBinary<0b101010, "vaeskf2.vi", uimm5>,
-                      SchedBinaryMC<"WriteVAESKF2V", "ReadVAESKF2V", "ReadVAESKF2V">;
-  }
+  def  VAESKF1_VI : PALUVINoVm<0b100010, "vaeskf1.vi", uimm5>,
+                    SchedUnaryMC<"WriteVAESKF1V", "ReadVAESKF1V">;
+  def  VAESKF2_VI : PALUVINoVmBinary<0b101010, "vaeskf2.vi", uimm5>,
+                    SchedBinaryMC<"WriteVAESKF2V", "ReadVAESKF2V", "ReadVAESKF2V">;
   let RVVConstraint = VS2Constraint in
   def  VAESZ_VS   : PALUVs2NoVmBinary<0b101001, 0b00111, OPMVV, "vaesz.vs">,
                     SchedBinaryMC<"WriteVAESZV", "ReadVAESZV", "ReadVAESZV">;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 8e834c74f5031..bf832154ad717 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -159,8 +159,8 @@ enum FusedCompareType {
 } // end namespace SystemZII
 
 namespace SystemZ {
-int64_t getTwoOperandOpcode(uint32_t Opcode);
-int64_t getTargetMemOpcode(uint32_t Opcode);
+int32_t getTwoOperandOpcode(uint32_t Opcode);
+int32_t getTargetMemOpcode(uint32_t Opcode);
 
 // Return a version of comparison CC mask CCMask in which the LT and GT
 // actions are swapped.
diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
index bbe3d4a89adce..c24ae77066f41 100644
--- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
+++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
@@ -25,9 +25,9 @@ Target &getTheWebAssemblyTarget64();
 
 namespace WebAssembly {
 
-int64_t getStackOpcode(uint32_t Opcode);
-int64_t getRegisterOpcode(uint32_t Opcode);
-int64_t getWasm64Opcode(uint32_t Opcode);
+int32_t getStackOpcode(uint32_t Opcode);
+int32_t getRegisterOpcode(uint32_t Opcode);
+int32_t getWasm64Opcode(uint32_t Opcode);
 
 } // namespace WebAssembly
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dbeb8fd86b835..3678327627b97 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24370,7 +24370,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
   assert(And.getValueType().isScalarInteger() && "Scalar type expected");
 
   APInt AndRHSVal;
-  SDValue Shl, Src, BitNo;
+  SDValue Shl, Src, Mask, BitNo;
   if (sd_match(And,
                m_And(m_TruncOrSelf(m_Value(Src)),
                      m_TruncOrSelf(m_AllOf(m_Value(Shl),
@@ -24384,6 +24384,10 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
       if (Known.countMinLeadingZeros() < (BitWidth - AndBitWidth))
         return SDValue();
     }
+  } else if (sd_match(And,
+                      m_ReassociatableAnd(m_Value(Src), m_Value(Mask),
+                                          m_Shl(m_One(), m_Value(BitNo))))) {
+    // (Src & Mask & (1 << BitNo)) ==/!= 0
   } else if (sd_match(And,
                       m_And(m_TruncOrSelf(m_Srl(m_Value(Src), m_Value(BitNo))),
                             m_One()))) {
@@ -24402,6 +24406,9 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
     return SDValue();
   }
 
+  if (Mask)
+    Src = DAG.getNode(ISD::AND, dl, Src.getValueType(), Src, Mask);
+
   // Remove any bit flip.
   if (isBitwiseNot(Src)) {
     Src = Src.getOperand(0);
@@ -39314,6 +39321,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     }
     break;
   }
+  case X86ISD::FANDN:
   case X86ISD::ANDNP: {
     KnownBits Known2;
     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index fc5d7519bdffe..3d381e26c37b2 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -515,13 +515,37 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["qsad-insts"] = true;
     Features["cvt-pknorm-vop2-insts"] = true;
     Features["fp8-conversion-insts"] = true;
+    Features["wmma-128b-insts"] = true;
     Features["atomic-fmin-fmax-global-f32"] = true;
     break;
   case GK_GFX1170:
-    // TODO-GFX1170: Update features map for gfx1170
+    Features["ci-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot8-insts"] = true;
+    Features["dot9-insts"] = true;
+    Features["dot10-insts"] = true;
+    Features["dot12-insts"] = true;
+    Features["dl-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["gfx8-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx10-insts"] = true;
+    Features["gfx10-3-insts"] = true;
+    Features["gfx11-insts"] = true;
+    Features["atomic-fadd-rtn-insts"] = true;
+    Features["image-insts"] = true;
+    Features["cube-insts"] = true;
+    Features["lerp-inst"] = true;
+    Features["sad-insts"] = true;
+    Features["qsad-insts"] = true;
+    Features["cvt-pknorm-vop2-insts"] = true;
+    Features["gws"] = true;
     Features["dot11-insts"] = true;
     Features["fp8-conversion-insts"] = true;
-    [[fallthrough]];
+    Features["wmma-128b-insts"] = true;
+    Features["atomic-fmin-fmax-global-f32"] = true;
+    break;
   case GK_GFX1153:
   case GK_GFX1152:
   case GK_GFX1151:
@@ -554,6 +578,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
     Features["qsad-insts"] = true;
     Features["cvt-pknorm-vop2-insts"] = true;
     Features["gws"] = true;
+    Features["wmma-256b-insts"] = true;
     Features["atomic-fmin-fmax-global-f32"] = true;
     break;
   case GK_GFX1036:
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
index 5ccf7b1adc3a7..61f8d6edce7b6 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
@@ -2,31 +2,7 @@
 ; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16  | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16  | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-declare i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtps.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtps.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtnu.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtns.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtns.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtmu.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtms.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtms.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtau.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtau.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtas.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtas.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half)
-declare half @llvm.aarch64.neon.frsqrte.f16(half)
-declare half @llvm.aarch64.neon.frecpx.f16(half)
-declare half @llvm.aarch64.neon.frecpe.f16(half)
-
-define dso_local i16 @t2(half %a) {
+define i16 @t2(half %a) {
 ; CHECK-SD-LABEL: t2:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcmp h0, #0.0
@@ -45,7 +21,7 @@ entry:
   ret i16 %vceqz
 }
 
-define dso_local i16 @t3(half %a) {
+define i16 @t3(half %a) {
 ; CHECK-SD-LABEL: t3:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcmp h0, #0.0
@@ -64,7 +40,7 @@ entry:
   ret i16 %vcgez
 }
 
-define dso_local i16 @t4(half %a) {
+define i16 @t4(half %a) {
 ; CHECK-SD-LABEL: t4:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcmp h0, #0.0
@@ -83,7 +59,7 @@ entry:
   ret i16 %vcgtz
 }
 
-define dso_local i16 @t5(half %a) {
+define i16 @t5(half %a) {
 ; CHECK-SD-LABEL: t5:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcmp h0, #0.0
@@ -102,7 +78,7 @@ entry:
   ret i16 %vclez
 }
 
-define dso_local i16 @t6(half %a) {
+define i16 @t6(half %a) {
 ; CHECK-SD-LABEL: t6:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcmp h0, #0.0
@@ -121,7 +97,7 @@ entry:
   ret i16 %vcltz
 }
 
-define dso_local half @t8(i32 %a) {
+define half @t8(i32 %a) {
 ; CHECK-LABEL: t8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    scvtf h0, w0
@@ -131,7 +107,7 @@ entry:
   ret half %0
 }
 
-define dso_local half @t9(i64 %a) {
+define half @t9(i64 %a) {
 ; CHECK-LABEL: t9:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    scvtf h0, x0
@@ -141,7 +117,7 @@ entry:
   ret half %0
 }
 
-define dso_local half @t12(i64 %a) {
+define half @t12(i64 %a) {
 ; CHECK-LABEL: t12:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ucvtf h0, x0
@@ -151,7 +127,7 @@ entry:
   ret half %0
 }
 
-define dso_local i16 @t13(half %a) {
+define i16 @t13(half %a) {
 ; CHECK-LABEL: t13:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtzs w0, h0
@@ -161,7 +137,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t15(half %a) {
+define i64 @t15(half %a) {
 ; CHECK-LABEL: t15:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtzs x0, h0
@@ -171,7 +147,7 @@ entry:
   ret i64 %0
 }
 
-define dso_local i16 @t16(half %a) {
+define i16 @t16(half %a) {
 ; CHECK-SD-LABEL: t16:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs w0, h0
@@ -186,7 +162,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t18(half %a) {
+define i64 @t18(half %a) {
 ; CHECK-LABEL: t18:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtzu x0, h0
@@ -269,7 +245,7 @@ entry:
   ret i16 %fcvt
 }
 
-define dso_local i16 @t19(half %a) {
+define i16 @t19(half %a) {
 ; CHECK-LABEL: t19:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtas w0, h0
@@ -280,7 +256,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t21(half %a) {
+define i64 @t21(half %a) {
 ; CHECK-LABEL: t21:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtas x0, h0
@@ -301,7 +277,7 @@ entry:
   ret i16 %fcvt
 }
 
-define dso_local i16 @t22(half %a) {
+define i16 @t22(half %a) {
 ; CHECK-LABEL: t22:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtau w0, h0
@@ -312,7 +288,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t24(half %a) {
+define i64 @t24(half %a) {
 ; CHECK-LABEL: t24:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtau x0, h0
@@ -333,7 +309,7 @@ entry:
   ret i16 %fcvt
 }
 
-define dso_local i16 @t25(half %a) {
+define i16 @t25(half %a) {
 ; CHECK-LABEL: t25:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtms w0, h0
@@ -344,7 +320,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t27(half %a) {
+define i64 @t27(half %a) {
 ; CHECK-LABEL: t27:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtms x0, h0
@@ -365,7 +341,7 @@ entry:
   ret i16 %fcvt
 }
 
-define dso_local i16 @t28(half %a) {
+define i16 @t28(half %a) {
 ; CHECK-LABEL: t28:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtmu w0, h0
@@ -376,7 +352,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t30(half %a) {
+define i64 @t30(half %a) {
 ; CHECK-LABEL: t30:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtmu x0, h0
@@ -397,7 +373,7 @@ entry:
   ret i16 %fcvt
 }
 
-define dso_local i16 @t31(half %a) {
+define i16 @t31(half %a) {
 ; CHECK-LABEL: t31:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtns w0, h0
@@ -408,7 +384,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t33(half %a) {
+define i64 @t33(half %a) {
 ; CHECK-LABEL: t33:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtns x0, h0
@@ -429,7 +405,7 @@ entry:
   ret i16 %fcvt
 }
 
-define dso_local i16 @t34(half %a) {
+define i16 @t34(half %a) {
 ; CHECK-LABEL: t34:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtnu w0, h0
@@ -440,7 +416,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t36(half %a) {
+define i64 @t36(half %a) {
 ; CHECK-LABEL: t36:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtnu x0, h0
@@ -461,7 +437,7 @@ entry:
   ret i16 %fcvt
 }
 
-define dso_local i16 @t37(half %a) {
+define i16 @t37(half %a) {
 ; CHECK-LABEL: t37:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtps w0, h0
@@ -472,7 +448,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t39(half %a) {
+define i64 @t39(half %a) {
 ; CHECK-LABEL: t39:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtps x0, h0
@@ -493,7 +469,7 @@ entry:
   ret i16 %fcvt
 }
 
-define dso_local i16 @t40(half %a) {
+define i16 @t40(half %a) {
 ; CHECK-LABEL: t40:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtpu w0, h0
@@ -504,7 +480,7 @@ entry:
   ret i16 %0
 }
 
-define dso_local i64 @t42(half %a) {
+define i64 @t42(half %a) {
 ; CHECK-LABEL: t42:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtpu x0, h0
@@ -514,7 +490,7 @@ entry:
   ret i64 %vcvtph_u64_f16
 }
 
-define dso_local half @t44(half %a) {
+define half @t44(half %a) {
 ; CHECK-LABEL: t44:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    frecpe h0, h0
@@ -524,7 +500,7 @@ entry:
   ret half %vrecpeh_f16
 }
 
-define dso_local half @t45(half %a) {
+define half @t45(half %a) {
 ; CHECK-LABEL: t45:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    frecpx h0, h0
@@ -534,7 +510,7 @@ entry:
   ret half %vrecpxh_f16
 }
 
-define dso_local half @t53(half %a) {
+define half @t53(half %a) {
 ; CHECK-LABEL: t53:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    frsqrte h0, h0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index 9693d544d1535..450cd0701911a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -1,14 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
@@ -17,13 +18,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
@@ -32,13 +33,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
@@ -47,13 +48,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
@@ -62,13 +63,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
@@ -77,13 +78,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
@@ -92,11 +93,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
@@ -105,11 +106,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
@@ -118,11 +119,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x half> %C
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
@@ -131,11 +132,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
@@ -144,13 +145,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -159,13 +160,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -174,13 +175,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -189,13 +190,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -204,13 +205,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -219,13 +220,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -234,13 +235,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -249,13 +250,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -264,13 +265,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
@@ -279,13 +280,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
@@ -294,11 +295,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
@@ -307,11 +308,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
@@ -322,13 +323,13 @@ bb:
 ; both neg and abs patterns (wmma matrix C f32 or f16 )
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %fneg.fabs.C = fneg <8 x float> %fabs.C
@@ -338,11 +339,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
   %fneg.fabs.C = fneg <8 x half> %fabs.C
@@ -352,15 +353,15 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_and_b32_e32 v11, 0x7fffffff, v11
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_and_b32_e32 v11, 0x7fffffff, v11
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <8 x float> %C, i32 3
   %el3.fabs = call float @llvm.fabs.f32(float %el3)
@@ -374,13 +375,13 @@ bb:
 ; A or B matrix modifier and constant in C
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -389,11 +390,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -404,6 +405,27 @@ bb:
 ; pack f16 elements with v_perm_b32 since they don't come from same b32
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    flat_load_b128 v[12:15], v[8:9]
+; GFX1170-NEXT:    flat_load_b128 v[16:19], v[8:9] offset:16
+; GFX1170-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX1170-NEXT:    v_and_b32_e32 v8, 0xffff, v12
+; GFX1170-NEXT:    v_and_b32_e32 v9, 0xffff, v14
+; GFX1170-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-NEXT:    v_and_b32_e32 v14, 0xffff, v16
+; GFX1170-NEXT:    v_and_b32_e32 v16, 0xffff, v18
+; GFX1170-NEXT:    v_lshl_or_b32 v12, v13, 16, v8
+; GFX1170-NEXT:    v_lshl_or_b32 v13, v15, 16, v9
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1170-NEXT:    v_lshl_or_b32 v14, v17, 16, v14
+; GFX1170-NEXT:    v_lshl_or_b32 v15, v19, 16, v16
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
+; GFX1170-NEXT:    global_store_b128 v[10:11], v[12:15], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
index 6b749df71223f..8f8267952cbe1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
@@ -1,14 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -16,27 +17,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
-; GFX12-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
-; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GCN-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GCN-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -44,13 +45,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -58,27 +59,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
-; GFX12-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
-; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GCN-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GCN-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -86,11 +87,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
   store <8 x half> %res, ptr addrspace(1) %out
@@ -98,19 +99,19 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x42004200
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x42004200
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
   store <8 x half> %res, ptr addrspace(1) %out
@@ -118,19 +119,19 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x3f803f80
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x3f803f80
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
   store <8 x i16> %res, ptr addrspace(1) %out
@@ -138,19 +139,19 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x3fc03fc0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x3fc03fc0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
   store <8 x i16> %res, ptr addrspace(1) %out
@@ -158,13 +159,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -172,27 +173,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_movk_i32 s0, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_movk_i32 s0, 0x80
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -200,13 +201,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -214,27 +215,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_movk_i32 s0, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
-; GFX12-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
-; GFX12-NEXT:    v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
-; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_movk_i32 s0, 0x80
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
+; GCN-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GCN-NEXT:    v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
+; GCN-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -242,13 +243,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -256,27 +257,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -284,13 +285,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -298,27 +299,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -326,13 +327,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -340,27 +341,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -368,13 +369,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -382,27 +383,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -410,13 +411,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -424,27 +425,27 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_movk_i32 s0, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s7, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    s_mov_b32 s5, s0
-; GFX12-NEXT:    s_mov_b32 s6, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_movk_i32 s0, 0x80
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s7, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -473,3 +474,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
index 929a51bfff53c..37900d6db1027 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
@@ -1,14 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -16,13 +17,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -30,13 +31,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -46,13 +47,13 @@ bb:
 
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -60,13 +61,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -74,13 +75,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -90,13 +91,13 @@ bb:
 
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -104,13 +105,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -118,13 +119,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -136,13 +137,13 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -150,13 +151,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -164,13 +165,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -180,13 +181,13 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -194,13 +195,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -208,13 +209,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -224,13 +225,13 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -238,13 +239,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -252,13 +253,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -271,3 +272,6 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1
 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
index 7c0f72606a5ba..a3d0da7dfc143 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
@@ -1,7 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v20, v[20:21], off
+; GFX1170-NEXT:    v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX1170-NEXT:    v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX1170-NEXT:    v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[22:23], v[26:29], off
+; GFX1170-NEXT:    global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[24:25], v[12:15], off
+; GFX1170-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v20, v[20:21], off
@@ -32,6 +52,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v20, v[20:21], off
+; GFX1170-NEXT:    v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX1170-NEXT:    v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX1170-NEXT:    v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[22:23], v[26:29], off
+; GFX1170-NEXT:    global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[24:25], v[12:15], off
+; GFX1170-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v20, v[20:21], off
@@ -62,6 +101,19 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v16, v[16:17], off
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[22:25], off
+; GFX1170-NEXT:    global_store_b128 v[20:21], v[12:15], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v16, v[16:17], off
@@ -86,6 +138,19 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v16, v[16:17], off
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[22:25], off
+; GFX1170-NEXT:    global_store_b128 v[20:21], v[12:15], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v16, v[16:17], off
@@ -110,6 +175,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -140,6 +224,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v11, v[11:12], off
+; GFX1170-NEXT:    v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
+; GFX1170-NEXT:    v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX1170-NEXT:    v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX1170-NEXT:    v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[21:24], off offset:16
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[7:10], off offset:16
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v11, v[11:12], off
@@ -170,6 +273,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -200,6 +322,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -230,6 +371,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -260,6 +420,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -299,3 +478,5 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
index da61bc4758879..4eacdbe171e3e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
@@ -1,14 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -16,13 +17,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -30,11 +31,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
   store <8 x half> %res, ptr addrspace(1) %out
@@ -42,11 +43,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
   store <8 x i16> %res, ptr addrspace(1) %out
@@ -54,13 +55,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -68,13 +69,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -82,13 +83,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -96,13 +97,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -110,13 +111,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -124,13 +125,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -138,13 +139,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -153,13 +154,13 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -167,13 +168,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -181,11 +182,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
-; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
   store <8 x half> %res, ptr addrspace(1) %out
@@ -193,11 +194,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
-; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
   store <8 x i16> %res, ptr addrspace(1) %out
@@ -205,13 +206,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -219,13 +220,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -233,13 +234,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -247,13 +248,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -261,13 +262,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -275,13 +276,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -289,13 +290,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -324,3 +325,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index a345ee6def7a7..3886a072b1763 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
@@ -15,11 +16,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
@@ -28,11 +29,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
@@ -41,11 +42,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
@@ -54,11 +55,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
@@ -67,11 +68,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
@@ -80,11 +81,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
@@ -93,11 +94,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
@@ -106,11 +107,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x half> %C
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
@@ -119,11 +120,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
@@ -132,11 +133,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -145,11 +146,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -158,11 +159,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -171,11 +172,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -184,11 +185,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -197,11 +198,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -210,11 +211,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -223,11 +224,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -236,11 +237,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
@@ -249,11 +250,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
@@ -262,11 +263,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
@@ -275,11 +276,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
@@ -290,11 +291,11 @@ bb:
 ; both neg and abs patterns (wmma matrix C f32 or f16 )
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %fneg.fabs.C = fneg <4 x float> %fabs.C
@@ -304,11 +305,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
   %fneg.fabs.C = fneg <4 x half> %fabs.C
@@ -318,13 +319,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_and_b32_e32 v7, 0x7fffffff, v7
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_and_b32_e32 v7, 0x7fffffff, v7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <4 x float> %C, i32 3
   %el3.fabs = call float @llvm.fabs.f32(float %el3)
@@ -338,11 +339,11 @@ bb:
 ; A or B matrix modifier and constant in C
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -351,11 +352,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -366,6 +367,20 @@ bb:
 ; pack f16 elements with v_perm_b32 since they don't come from same b32
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    flat_load_b128 v[8:11], v[4:5]
+; GFX1170-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-NEXT:    v_and_b32_e32 v4, 0xffff, v8
+; GFX1170-NEXT:    v_and_b32_e32 v5, 0xffff, v10
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1170-NEXT:    v_lshl_or_b32 v4, v9, 16, v4
+; GFX1170-NEXT:    v_lshl_or_b32 v5, v11, 16, v5
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX1170-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b128 v[8:11], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
index 5344ab8da1ade..ce9b8f9fc3c14 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -14,21 +15,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v9, s3
-; GFX12-NEXT:    v_mov_b32_e32 v8, s2
-; GFX12-NEXT:    v_mov_b32_e32 v7, s1
-; GFX12-NEXT:    v_mov_b32_e32 v6, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-NEXT:    v_mov_b32_e32 v7, s1
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -36,11 +37,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -48,21 +49,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v9, s3
-; GFX12-NEXT:    v_mov_b32_e32 v8, s2
-; GFX12-NEXT:    v_mov_b32_e32 v7, s1
-; GFX12-NEXT:    v_mov_b32_e32 v6, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v9, s3
+; GCN-NEXT:    v_mov_b32_e32 v8, s2
+; GCN-NEXT:    v_mov_b32_e32 v7, s1
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -70,11 +71,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
   store <4 x half> %res, ptr addrspace(1) %out
@@ -82,17 +83,17 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x42004200
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s1
-; GFX12-NEXT:    v_mov_b32_e32 v6, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x42004200
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s1
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
   store <4 x half> %res, ptr addrspace(1) %out
@@ -100,17 +101,17 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x3f803f80
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s1
-; GFX12-NEXT:    v_mov_b32_e32 v6, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x3f803f80
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s1
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
   store <4 x i16> %res, ptr addrspace(1) %out
@@ -118,17 +119,17 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x3fc03fc0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s1
-; GFX12-NEXT:    v_mov_b32_e32 v6, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x3fc03fc0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s1
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
   store <4 x i16> %res, ptr addrspace(1) %out
@@ -136,11 +137,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -148,21 +149,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_movk_i32 s0, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s3
-; GFX12-NEXT:    v_mov_b32_e32 v6, s2
-; GFX12-NEXT:    v_mov_b32_e32 v5, s1
-; GFX12-NEXT:    v_mov_b32_e32 v4, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_movk_i32 s0, 0x80
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -170,11 +171,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -182,21 +183,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_movk_i32 s0, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s3
-; GFX12-NEXT:    v_mov_b32_e32 v6, s2
-; GFX12-NEXT:    v_mov_b32_e32 v5, s1
-; GFX12-NEXT:    v_mov_b32_e32 v4, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_movk_i32 s0, 0x80
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -204,11 +205,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -216,21 +217,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s3
-; GFX12-NEXT:    v_mov_b32_e32 v6, s2
-; GFX12-NEXT:    v_mov_b32_e32 v5, s1
-; GFX12-NEXT:    v_mov_b32_e32 v4, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -238,11 +239,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -250,21 +251,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s3
-; GFX12-NEXT:    v_mov_b32_e32 v6, s2
-; GFX12-NEXT:    v_mov_b32_e32 v5, s1
-; GFX12-NEXT:    v_mov_b32_e32 v4, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -272,11 +273,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -284,21 +285,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s3
-; GFX12-NEXT:    v_mov_b32_e32 v6, s2
-; GFX12-NEXT:    v_mov_b32_e32 v5, s1
-; GFX12-NEXT:    v_mov_b32_e32 v4, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -306,11 +307,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -318,21 +319,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s3
-; GFX12-NEXT:    v_mov_b32_e32 v6, s2
-; GFX12-NEXT:    v_mov_b32_e32 v5, s1
-; GFX12-NEXT:    v_mov_b32_e32 v4, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -340,11 +341,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -352,21 +353,21 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_movk_i32 s0, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s3, s0
-; GFX12-NEXT:    s_mov_b32 s1, s0
-; GFX12-NEXT:    s_mov_b32 s2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v7, s3
-; GFX12-NEXT:    v_mov_b32_e32 v6, s2
-; GFX12-NEXT:    v_mov_b32_e32 v5, s1
-; GFX12-NEXT:    v_mov_b32_e32 v4, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_movk_i32 s0, 0x80
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_mov_b32 s3, s0
+; GCN-NEXT:    s_mov_b32 s1, s0
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v7, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -384,3 +385,6 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
index e47350db4003e..a87163b0dca14 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -14,11 +15,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -26,11 +27,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -40,11 +41,11 @@ bb:
 
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -52,11 +53,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -64,11 +65,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -78,11 +79,11 @@ bb:
 
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -90,11 +91,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -102,11 +103,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -119,11 +120,11 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -131,11 +132,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -143,11 +144,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -157,11 +158,11 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -169,11 +170,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -181,11 +182,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
-; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+; GCN-NEXT:    global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -195,11 +196,11 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -207,11 +208,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -219,11 +220,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -236,3 +237,6 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 imma
 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index da6852042f7f5..7d31e262b4862 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,7 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v10, v[10:11], off
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v31, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v30, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v29, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[12:13], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[14:15], v[24:27], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[28:31], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v10, v[10:11], off
@@ -46,6 +74,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v10, v[10:11], off
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v31, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v30, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v29, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[12:13], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[14:15], v[24:27], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[28:31], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v10, v[10:11], off
@@ -90,6 +145,27 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v22, v[8:9], off
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX1170-NEXT:    global_store_b64 v[10:11], v[8:9], off
+; GFX1170-NEXT:    global_store_b64 v[12:13], v[18:19], off
+; GFX1170-NEXT:    global_store_b64 v[14:15], v[20:21], off
+; GFX1170-NEXT:    global_store_b64 v[16:17], v[6:7], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v22, v[8:9], off
@@ -128,6 +204,27 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v22, v[8:9], off
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX1170-NEXT:    global_store_b64 v[10:11], v[8:9], off
+; GFX1170-NEXT:    global_store_b64 v[12:13], v[18:19], off
+; GFX1170-NEXT:    global_store_b64 v[14:15], v[20:21], off
+; GFX1170-NEXT:    global_store_b64 v[16:17], v[6:7], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v22, v[8:9], off
@@ -166,6 +263,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -210,6 +334,21 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v6, v[6:7], off
+; GFX1170-NEXT:    v_mov_b32_e32 v15, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v14, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v2
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[12:15], off
+; GFX1170-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v6, v[6:7], off
@@ -236,6 +375,21 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v16, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v15, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v14, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[13:16], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -262,6 +416,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -306,6 +487,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -350,6 +558,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -394,6 +629,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -448,3 +710,5 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
index 957b7b1b2c77c..bb256883c29ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -14,11 +15,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -26,11 +27,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
   store <4 x half> %res, ptr addrspace(1) %out
@@ -38,11 +39,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
   store <4 x i16> %res, ptr addrspace(1) %out
@@ -50,11 +51,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -62,11 +63,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -74,11 +75,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -86,11 +87,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -98,11 +99,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -110,11 +111,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -122,11 +123,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -134,11 +135,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
-; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+; GCN-NEXT:    global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -146,11 +147,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
-; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+; GCN-NEXT:    global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -158,11 +159,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
-; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+; GCN-NEXT:    global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
   store <4 x half> %res, ptr addrspace(1) %out
@@ -170,11 +171,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
-; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+; GCN-NEXT:    global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
   store <4 x i16> %res, ptr addrspace(1) %out
@@ -182,11 +183,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -194,11 +195,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
-; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+; GCN-NEXT:    global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -206,11 +207,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -218,11 +219,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -230,11 +231,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -242,11 +243,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -254,11 +255,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -287,3 +288,6 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 98cb09642511e..6919c6d3f70ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index 4a010071d58c8..bc5c3283fb49e 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -1,14 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
@@ -17,13 +18,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
@@ -32,13 +33,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
@@ -47,13 +48,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
@@ -62,13 +63,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
@@ -77,13 +78,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
@@ -92,11 +93,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
@@ -105,11 +106,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
@@ -118,11 +119,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x half> %C
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
@@ -131,11 +132,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
@@ -144,13 +145,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -159,13 +160,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -174,13 +175,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -189,13 +190,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -204,13 +205,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -219,13 +220,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -234,13 +235,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -249,13 +250,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -264,13 +265,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT:    global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
@@ -279,13 +280,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT:    global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
@@ -294,11 +295,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
@@ -307,11 +308,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
@@ -322,13 +323,13 @@ bb:
 ; both neg and abs patterns (wmma matrix C f32 or f16 )
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
   %fneg.fabs.C = fneg <8 x float> %fabs.C
@@ -338,11 +339,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
   %fneg.fabs.C = fneg <8 x half> %fabs.C
@@ -352,15 +353,15 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_and_b32_e32 v11, 0x7fffffff, v11
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_and_b32_e32 v11, 0x7fffffff, v11
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <8 x float> %C, i32 3
   %el3.fabs = call float @llvm.fabs.f32(float %el3)
@@ -374,13 +375,13 @@ bb:
 ; A or B matrix modifier and constant in C
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -389,11 +390,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -404,6 +405,24 @@ bb:
 ; pack f16 elements with v_perm_b32 since they don't come from same b32
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    flat_load_b128 v[12:15], v[8:9] offset:16
+; GFX1170-NEXT:    flat_load_b128 v[16:19], v[8:9]
+; GFX1170-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX1170-NEXT:    v_mov_b16_e32 v8.l, v15.l
+; GFX1170-NEXT:    v_mov_b16_e32 v9.l, v14.l
+; GFX1170-NEXT:    v_perm_b32 v14, v13, v12, 0x5040100
+; GFX1170-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-NEXT:    v_perm_b32 v13, v19, v18, 0x5040100
+; GFX1170-NEXT:    v_perm_b32 v12, v17, v16, 0x5040100
+; GFX1170-NEXT:    v_perm_b32 v15, v8, v9, 0x5040100
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
+; GFX1170-NEXT:    global_store_b128 v[10:11], v[12:15], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
index 1b44e8f01c0f9..2558dc3903640 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
@@ -1,14 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -16,6 +17,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v10, 0x40400000
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v14, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v15, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v16, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v10
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x40400000
@@ -36,13 +55,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -50,6 +69,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v10, 0x40400000
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v14, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v15, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v16, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v10
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x40400000
@@ -70,11 +107,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
-; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
+; GCN-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
   store <8 x half> %res, ptr addrspace(1) %out
@@ -82,6 +119,17 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v10, 0x42004200
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v10
+; GFX1170-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x42004200
@@ -98,6 +146,17 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v10, 0x3f803f80
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v10
+; GFX1170-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x3f803f80
@@ -114,6 +173,17 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v10, 0x3fc03fc0
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v10
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v10
+; GFX1170-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x3fc03fc0
@@ -130,13 +200,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -144,6 +214,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v6, 0x80
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v7, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v10, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v6
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x80
@@ -164,13 +252,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -178,6 +266,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v4, 0x80
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v10, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v4
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
@@ -198,13 +304,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -212,6 +318,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v6, 0x40400000
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v7, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v10, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v6
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
@@ -232,13 +356,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -246,6 +370,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v6, 0x40400000
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v7, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v10, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v6
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
@@ -266,13 +408,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -280,6 +422,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v6, 0x40400000
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v7, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v10, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v6
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
@@ -300,13 +460,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -314,6 +474,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v6, 0x40400000
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v7, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v10, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v6
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
@@ -334,13 +512,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -348,6 +526,24 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    v_mov_b32_e32 v6, 0x80
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_mov_b32_e32 v7, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v10, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v11, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v6
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x80
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
index 945305848b3e1..9d8f26ea11cb8 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
@@ -1,14 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -16,13 +17,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -30,13 +31,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -46,13 +47,13 @@ bb:
 
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -60,13 +61,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -74,13 +75,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -90,13 +91,13 @@ bb:
 
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -104,13 +105,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -118,13 +119,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -136,13 +137,13 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -150,13 +151,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -164,13 +165,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -180,13 +181,13 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -194,13 +195,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -208,13 +209,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -224,13 +225,13 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -238,13 +239,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -252,13 +253,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -271,3 +272,6 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x
 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
index cd7edc21718c9..f7dd2d189a2b2 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
@@ -1,7 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v20, v[20:21], off
+; GFX1170-NEXT:    v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX1170-NEXT:    v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX1170-NEXT:    v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[22:23], v[26:29], off
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[24:25], v[12:15], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v20, v[20:21], off
@@ -32,6 +52,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v20, v[20:21], off
+; GFX1170-NEXT:    v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX1170-NEXT:    v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX1170-NEXT:    v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[22:23], v[26:29], off
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[24:25], v[12:15], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v20, v[20:21], off
@@ -62,6 +101,19 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v16, v[16:17], off
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[22:25], off
+; GFX1170-NEXT:    global_store_b128 v[20:21], v[12:15], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v16, v[16:17], off
@@ -86,6 +138,19 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v16, v[16:17], off
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[22:25], off
+; GFX1170-NEXT:    global_store_b128 v[20:21], v[12:15], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v16, v[16:17], off
@@ -110,6 +175,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -140,6 +224,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v11, v[11:12], off
+; GFX1170-NEXT:    v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
+; GFX1170-NEXT:    v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX1170-NEXT:    v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX1170-NEXT:    v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[21:24], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[17:20], off
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[7:10], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v11, v[11:12], off
@@ -170,6 +273,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -200,6 +322,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -230,6 +371,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -260,6 +420,25 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT:    v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT:    v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT:    v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT:    v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT:    s_clause 0x1
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v14, v[14:15], off
@@ -299,3 +478,5 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
index d67625248669a..0993c00c30415 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
@@ -1,14 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -16,13 +17,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT:    global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -30,11 +31,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
   store <8 x half> %res, ptr addrspace(1) %out
@@ -42,11 +43,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
   store <8 x i16> %res, ptr addrspace(1) %out
@@ -54,13 +55,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -68,13 +69,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -82,13 +83,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -96,13 +97,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -110,13 +111,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -124,13 +125,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -138,13 +139,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -153,13 +154,13 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT:    global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -167,13 +168,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT:    global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -181,11 +182,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
-; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
   store <8 x half> %res, ptr addrspace(1) %out
@@ -193,11 +194,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
-; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
   store <8 x i16> %res, ptr addrspace(1) %out
@@ -205,13 +206,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -219,13 +220,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT:    global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -233,13 +234,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
   store <8 x i32> %res, ptr addrspace(1) %out
@@ -247,13 +248,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -261,13 +262,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -275,13 +276,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -289,13 +290,13 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT:    global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
   store <8 x float> %res, ptr addrspace(1) %out
@@ -324,3 +325,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 53bede84513c9..1a2d59e969590 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,13 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12-FAKE16
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
@@ -16,11 +18,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
@@ -29,11 +31,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
@@ -42,11 +44,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
@@ -55,11 +57,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
@@ -68,11 +70,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
@@ -81,11 +83,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
@@ -94,11 +96,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
@@ -107,11 +109,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x half> %C
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
@@ -120,11 +122,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
@@ -133,11 +135,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -146,11 +148,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -159,11 +161,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -172,11 +174,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -185,11 +187,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -198,11 +200,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -211,11 +213,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -224,11 +226,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -237,11 +239,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
@@ -250,11 +252,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
@@ -263,11 +265,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
@@ -276,11 +278,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
@@ -291,11 +293,11 @@ bb:
 ; both neg and abs patterns (wmma matrix C f32 or f16 )
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
   %fneg.fabs.C = fneg <4 x float> %fabs.C
@@ -305,11 +307,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
   %fneg.fabs.C = fneg <4 x half> %fabs.C
@@ -319,13 +321,13 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_and_b32_e32 v7, 0x7fffffff, v7
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_and_b32_e32 v7, 0x7fffffff, v7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <4 x float> %C, i32 3
   %el3.fabs = call float @llvm.fabs.f32(float %el3)
@@ -339,11 +341,11 @@ bb:
 ; A or B matrix modifier and constant in C
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -352,11 +354,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -367,6 +369,29 @@ bb:
 ; pack f16 elements with v_perm_b32 since they don't come from same b32
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
+; GFX1170-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170-TRUE16:       ; %bb.0: ; %bb
+; GFX1170-TRUE16-NEXT:    flat_load_b128 v[8:11], v[4:5]
+; GFX1170-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v11.l
+; GFX1170-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v9.l
+; GFX1170-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1170-TRUE16-NEXT:    v_mov_b32_e32 v9, v10
+; GFX1170-TRUE16-NEXT:    v_wmma_f16_16x16x16_f16 v[8:9], v[0:1], v[2:3], v[8:9] neg_lo:[0,0,1]
+; GFX1170-TRUE16-NEXT:    global_store_b64 v[6:7], v[8:9], off
+; GFX1170-TRUE16-NEXT:    s_endpgm
+;
+; GFX1170-FAKE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170-FAKE16:       ; %bb.0: ; %bb
+; GFX1170-FAKE16-NEXT:    flat_load_b128 v[8:11], v[4:5]
+; GFX1170-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-FAKE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX1170-FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX1170-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-FAKE16-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX1170-FAKE16-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GFX1170-FAKE16-NEXT:    s_endpgm
+;
 ; GFX12-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 ; GFX12-TRUE16:       ; %bb.0: ; %bb
 ; GFX12-TRUE16-NEXT:    flat_load_b128 v[8:11], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index a8f5726632aa1..a4222338a5038 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -14,16 +15,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v7, v6
-; GFX12-NEXT:    v_mov_b32_e32 v8, v6
-; GFX12-NEXT:    v_mov_b32_e32 v9, v6
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v7, v6
+; GCN-NEXT:    v_mov_b32_e32 v8, v6
+; GCN-NEXT:    v_mov_b32_e32 v9, v6
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -31,11 +32,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -43,16 +44,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v7, v6
-; GFX12-NEXT:    v_mov_b32_e32 v8, v6
-; GFX12-NEXT:    v_mov_b32_e32 v9, v6
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
-; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v7, v6
+; GCN-NEXT:    v_mov_b32_e32 v8, v6
+; GCN-NEXT:    v_mov_b32_e32 v9, v6
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GCN-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -60,11 +61,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
   store <4 x half> %res, ptr addrspace(1) %out
@@ -72,14 +73,14 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v6, 0x42004200
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v7, v6
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x42004200
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v7, v6
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
   store <4 x half> %res, ptr addrspace(1) %out
@@ -87,14 +88,14 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v6, 0x3f803f80
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v7, v6
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x3f803f80
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v7, v6
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
   store <4 x i16> %res, ptr addrspace(1) %out
@@ -102,14 +103,14 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v6, 0x3fc03fc0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v7, v6
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x3fc03fc0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v7, v6
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT:    global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
   store <4 x i16> %res, ptr addrspace(1) %out
@@ -117,11 +118,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -129,16 +130,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    v_mov_b32_e32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x80
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NEXT:    v_mov_b32_e32 v7, v4
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -146,11 +147,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -158,16 +159,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    v_mov_b32_e32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x80
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NEXT:    v_mov_b32_e32 v7, v4
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -175,11 +176,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -187,16 +188,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    v_mov_b32_e32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NEXT:    v_mov_b32_e32 v7, v4
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -204,11 +205,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -216,16 +217,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    v_mov_b32_e32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NEXT:    v_mov_b32_e32 v7, v4
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -233,11 +234,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -245,16 +246,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    v_mov_b32_e32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NEXT:    v_mov_b32_e32 v7, v4
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -262,11 +263,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -274,16 +275,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    v_mov_b32_e32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40400000
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NEXT:    v_mov_b32_e32 v7, v4
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -291,11 +292,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -303,16 +304,16 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    v_mov_b32_e32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x80
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NEXT:    v_mov_b32_e32 v7, v4
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -330,3 +331,6 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, <
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index 9303dbfad437f..baeb81ab62957 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -14,11 +15,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -26,11 +27,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -40,11 +41,11 @@ bb:
 
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -52,11 +53,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -64,11 +65,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -78,11 +79,11 @@ bb:
 
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -90,11 +91,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -102,11 +103,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -114,11 +115,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -126,11 +127,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -138,11 +139,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -152,11 +153,11 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -164,11 +165,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -176,11 +177,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
-; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+; GCN-NEXT:    global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -190,11 +191,11 @@ bb:
 
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -202,11 +203,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -214,11 +215,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -231,3 +232,6 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i
 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index fdfec74e01b7b..183230a1242bf 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,7 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v10, v[10:11], off
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v31, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v30, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v29, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[12:13], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[14:15], v[24:27], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[28:31], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v10, v[10:11], off
@@ -46,6 +74,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v10, v[10:11], off
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v31, v9
+; GFX1170-NEXT:    v_mov_b32_e32 v30, v8
+; GFX1170-NEXT:    v_mov_b32_e32 v29, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[12:13], v[20:23], off
+; GFX1170-NEXT:    global_store_b128 v[14:15], v[24:27], off
+; GFX1170-NEXT:    global_store_b128 v[16:17], v[28:31], off
+; GFX1170-NEXT:    global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v10, v[10:11], off
@@ -90,6 +145,27 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v22, v[8:9], off
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX1170-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX1170-NEXT:    global_store_b64 v[10:11], v[8:9], off
+; GFX1170-NEXT:    global_store_b64 v[12:13], v[18:19], off
+; GFX1170-NEXT:    global_store_b64 v[14:15], v[20:21], off
+; GFX1170-NEXT:    global_store_b64 v[16:17], v[6:7], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v22, v[8:9], off
@@ -128,6 +204,27 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v22, v[8:9], off
+; GFX1170-NEXT:    v_mov_b32_e32 v9, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v8, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v7
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX1170-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX1170-NEXT:    global_store_b64 v[10:11], v[8:9], off
+; GFX1170-NEXT:    global_store_b64 v[12:13], v[18:19], off
+; GFX1170-NEXT:    global_store_b64 v[14:15], v[20:21], off
+; GFX1170-NEXT:    global_store_b64 v[16:17], v[6:7], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v22, v[8:9], off
@@ -166,6 +263,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -210,6 +334,21 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v6, v[6:7], off
+; GFX1170-NEXT:    v_mov_b32_e32 v15, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v14, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v12, v2
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
+; GFX1170-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX1170-NEXT:    global_store_b128 v[8:9], v[12:15], off
+; GFX1170-NEXT:    global_store_b128 v[10:11], v[2:5], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v6, v[6:7], off
@@ -236,6 +375,21 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v16, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v15, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v14, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v13, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT:    v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[13:16], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -262,6 +416,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -306,6 +487,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -350,6 +558,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -394,6 +629,33 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX1170:       ; %bb.0: ; %bb
+; GFX1170-NEXT:    global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT:    v_mov_b32_e32 v20, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v24, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v23, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v22, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v21, v3
+; GFX1170-NEXT:    v_mov_b32_e32 v28, v6
+; GFX1170-NEXT:    v_mov_b32_e32 v27, v5
+; GFX1170-NEXT:    v_mov_b32_e32 v26, v4
+; GFX1170-NEXT:    v_mov_b32_e32 v25, v3
+; GFX1170-NEXT:    s_waitcnt vmcnt(0)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT:    global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT:    global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT:    global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT:    global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT:    s_endpgm
+;
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
@@ -448,3 +710,5 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
index 896efb06d5595..60dc7cc766f75 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -14,11 +15,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
-; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GCN-NEXT:    global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -26,11 +27,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
   store <4 x half> %res, ptr addrspace(1) %out
@@ -38,11 +39,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
   store <4 x i16> %res, ptr addrspace(1) %out
@@ -50,11 +51,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -62,11 +63,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -74,11 +75,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -86,11 +87,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -98,11 +99,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -110,11 +111,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -122,11 +123,11 @@ bb:
 }
 
 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT:    global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -134,11 +135,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
-; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+; GCN-NEXT:    global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -146,11 +147,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
-; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+; GCN-NEXT:    global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -158,11 +159,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
-; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+; GCN-NEXT:    global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
   store <4 x half> %res, ptr addrspace(1) %out
@@ -170,11 +171,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
-; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+; GCN-NEXT:    global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
   store <4 x i16> %res, ptr addrspace(1) %out
@@ -182,11 +183,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -194,11 +195,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
-; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+; GCN-NEXT:    global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -206,11 +207,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
   store <4 x i32> %res, ptr addrspace(1) %out
@@ -218,11 +219,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -230,11 +231,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -242,11 +243,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -254,11 +255,11 @@ bb:
 }
 
 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT:    global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
   store <4 x float> %res, ptr addrspace(1) %out
@@ -287,3 +288,6 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
index ef85de2012943..897bd2d8517a4 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX1170 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
 
 # D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
 #  $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
@@ -11,12 +12,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
 ...
@@ -27,12 +28,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
 ...
@@ -43,11 +44,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
-    ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
 ...
@@ -58,12 +59,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
-    ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
 ...
@@ -73,12 +74,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
-    ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
 ...
@@ -89,11 +90,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
 
-    ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
 ...
@@ -104,12 +105,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
 ...
@@ -120,12 +121,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
 ...
@@ -136,11 +137,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
 ...
@@ -151,12 +152,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
 ...
@@ -167,12 +168,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
-    ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
 ...
@@ -183,11 +184,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
 
-    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
 ...
@@ -198,6 +199,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
 
+    ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
+    ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+    ; GFX1170-NEXT: {{  $}}
+    ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GFX1170-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
+    ;
     ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
     ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
     ; GFX12-NEXT: {{  $}}
@@ -214,12 +221,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
 
-    ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
 ...
@@ -230,12 +237,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
 
-    ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
 ...
@@ -246,11 +253,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
 
-    ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
 ...
@@ -261,6 +268,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
 
+    ; GFX1170-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
+    ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+    ; GFX1170-NEXT: {{  $}}
+    ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GFX1170-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
+    ;
     ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
     ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
     ; GFX12-NEXT: {{  $}}
@@ -277,12 +290,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
 
-    ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
 ...
@@ -293,12 +306,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
 
-    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
 ...
@@ -309,11 +322,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
 
-    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
 ...
@@ -324,6 +337,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
 
+    ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
+    ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+    ; GFX1170-NEXT: {{  $}}
+    ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GFX1170-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
+    ;
     ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
     ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
     ; GFX12-NEXT: {{  $}}
@@ -340,12 +359,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
 
-    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
     early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
index 4073964e2b038..0a80543b9977d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX1170 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
 
 # D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
 #  $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
@@ -11,12 +12,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
 ...
@@ -27,12 +28,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
 ...
@@ -43,11 +44,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
 ...
@@ -58,12 +59,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
 ...
@@ -74,12 +75,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
 ...
@@ -90,11 +91,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
 ...
@@ -105,12 +106,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
 ...
@@ -121,12 +122,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
 ...
@@ -137,11 +138,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
 ...
@@ -152,12 +153,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
 ...
@@ -168,12 +169,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
 
-    ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
 ...
@@ -184,11 +185,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
 
-    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
 ...
@@ -199,6 +200,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
 
+    ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
+    ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+    ; GFX1170-NEXT: {{  $}}
+    ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GFX1170-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
+    ;
     ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
     ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
     ; GFX12-NEXT: {{  $}}
@@ -215,12 +222,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
 
-    ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
 ...
@@ -231,12 +238,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
 
-    ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
 ...
@@ -247,11 +254,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
 
-    ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
 ...
@@ -262,6 +269,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
 
+    ; GFX1170-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
+    ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
+    ; GFX1170-NEXT: {{  $}}
+    ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GFX1170-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
+    ;
     ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
     ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
     ; GFX12-NEXT: {{  $}}
@@ -278,12 +291,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
 
-    ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
 ...
@@ -294,12 +307,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
 
-    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
 ...
@@ -310,11 +323,11 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
 
-    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
 ...
@@ -325,6 +338,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
 
+    ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
+    ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+    ; GFX1170-NEXT: {{  $}}
+    ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GFX1170-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
+    ;
     ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
     ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
     ; GFX12-NEXT: {{  $}}
@@ -341,12 +360,12 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
 
-    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
-    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
-    ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
-    ; GFX12-NEXT: V_NOP_e32 implicit $exec
-    ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
+    ; GCN-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
     early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
     early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir b/llvm/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
index 7160a3e3c3d84..5666c2141c5ee 100644
--- a/llvm/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
+++ b/llvm/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
@@ -17,7 +17,7 @@ body: |
   bb.0.entry:
     liveins: $rdi
 
-  ; CHECK: [[@LINE+1]]:78: expected tied-def or low-level type after '('
+  ; CHECK: [[@LINE+1]]:78: expected an integer literal after 'tied-def'
     INLINEASM &"$foo", 1, 2818058, def $rdi, 2147483657, killed $rdi(tied-def)
     $rax = COPY killed $rdi
     RET64 killed $rax
diff --git a/llvm/test/CodeGen/MIR/X86/invalid-tied-physical-reg-def.mir b/llvm/test/CodeGen/MIR/X86/invalid-tied-physical-reg-def.mir
new file mode 100644
index 0000000000000..66c458e79f316
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/X86/invalid-tied-physical-reg-def.mir
@@ -0,0 +1,15 @@
+# RUN: not llc -mtriple=x86_64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+---
+name:            test
+tracksRegLiveness: true
+liveins:
+  - { reg: '$rdi' }
+body: |
+  bb.0.entry:
+    liveins: $rdi
+
+  ; CHECK: [[@LINE+1]]:45: tied-def not supported for defs
+    INLINEASM &"$foo", 1, 2818058, def $rdi(tied-def 5), 2147483657, killed $rdi(tied-def 3)
+    $rax = COPY killed $rdi
+    RET64 killed $rax
+...
diff --git a/llvm/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir b/llvm/test/CodeGen/MIR/X86/invalid-type-physical-reg.mir
similarity index 87%
rename from llvm/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir
rename to llvm/test/CodeGen/MIR/X86/invalid-type-physical-reg.mir
index a2c65dd3f0195..f2d94339e3ae5 100644
--- a/llvm/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir
+++ b/llvm/test/CodeGen/MIR/X86/invalid-type-physical-reg.mir
@@ -17,7 +17,7 @@ body: |
   bb.0.entry:
     liveins: $rdi
 
-  ; CHECK: [[@LINE+1]]:70: expected tied-def or low-level type after '('
+  ; CHECK: [[@LINE+1]]:70: unexpected type on physical register
     INLINEASM &"$foo", 1, 2818058, def $rdi, 2147483657, killed $rdi(3)
     $rax = COPY killed $rdi
     RET64 killed $rax
diff --git a/llvm/test/CodeGen/PowerPC/clmul-vector.ll b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
new file mode 100644
index 0000000000000..9089dca5b0ed7
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
@@ -0,0 +1,8874 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=powerpc64-- | FileCheck %s --check-prefixes=CHECK,BE
+; RUN: llc < %s -mtriple=powerpc64le-- | FileCheck %s --check-prefixes=CHECK,LE
+
+define <16 x i8> @clmul_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; BE-LABEL: clmul_v16i8:
+; BE:       # %bb.0:
+; BE-NEXT:    addis 3, 2, .LCPI0_0 at toc@ha
+; BE-NEXT:    vspltisb 4, 2
+; BE-NEXT:    addi 3, 3, .LCPI0_0 at toc@l
+; BE-NEXT:    vand 4, 3, 4
+; BE-NEXT:    lvx 10, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI0_1 at toc@ha
+; BE-NEXT:    vspltisb 5, 1
+; BE-NEXT:    addi 3, 3, .LCPI0_1 at toc@l
+; BE-NEXT:    vspltisb 0, 4
+; BE-NEXT:    vand 5, 3, 5
+; BE-NEXT:    vspltisb 6, 8
+; BE-NEXT:    vspltisb 8, -1
+; BE-NEXT:    vmuloub 9, 2, 4
+; BE-NEXT:    vmuleub 4, 2, 4
+; BE-NEXT:    vand 1, 3, 0
+; BE-NEXT:    vperm 4, 4, 9, 10
+; BE-NEXT:    vmuloub 9, 2, 5
+; BE-NEXT:    vmuleub 5, 2, 5
+; BE-NEXT:    vand 7, 3, 6
+; BE-NEXT:    vaddubm 6, 6, 6
+; BE-NEXT:    vperm 5, 5, 9, 10
+; BE-NEXT:    vmuloub 9, 2, 1
+; BE-NEXT:    vmuleub 1, 2, 1
+; BE-NEXT:    vperm 1, 1, 9, 10
+; BE-NEXT:    vmuloub 9, 2, 7
+; BE-NEXT:    vmuleub 7, 2, 7
+; BE-NEXT:    vand 6, 3, 6
+; BE-NEXT:    vperm 7, 7, 9, 10
+; BE-NEXT:    vmuloub 9, 2, 6
+; BE-NEXT:    vmuleub 6, 2, 6
+; BE-NEXT:    vperm 6, 6, 9, 10
+; BE-NEXT:    lvx 9, 0, 3
+; BE-NEXT:    vslb 0, 0, 0
+; BE-NEXT:    vslb 8, 8, 8
+; BE-NEXT:    vand 0, 3, 0
+; BE-NEXT:    vand 8, 3, 8
+; BE-NEXT:    vand 3, 3, 9
+; BE-NEXT:    vmuloub 9, 2, 0
+; BE-NEXT:    vmuleub 0, 2, 0
+; BE-NEXT:    vxor 4, 5, 4
+; BE-NEXT:    vperm 0, 0, 9, 10
+; BE-NEXT:    vmuloub 9, 2, 8
+; BE-NEXT:    vmuleub 8, 2, 8
+; BE-NEXT:    vmuloub 5, 2, 3
+; BE-NEXT:    vmuleub 2, 2, 3
+; BE-NEXT:    vxor 3, 4, 1
+; BE-NEXT:    vxor 3, 3, 7
+; BE-NEXT:    vperm 2, 2, 5, 10
+; BE-NEXT:    vxor 3, 3, 6
+; BE-NEXT:    vxor 2, 3, 2
+; BE-NEXT:    vperm 8, 8, 9, 10
+; BE-NEXT:    vxor 2, 2, 0
+; BE-NEXT:    vxor 2, 2, 8
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmul_v16i8:
+; LE:       # %bb.0:
+; LE-NEXT:    vspltisb 4, 2
+; LE-NEXT:    addis 3, 2, .LCPI0_0 at toc@ha
+; LE-NEXT:    vspltisb 5, 1
+; LE-NEXT:    addi 3, 3, .LCPI0_0 at toc@l
+; LE-NEXT:    xxland 36, 35, 36
+; LE-NEXT:    xxland 37, 35, 37
+; LE-NEXT:    vspltisb 0, 4
+; LE-NEXT:    vspltisb 1, 8
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    vmuloub 7, 2, 4
+; LE-NEXT:    vmuleub 4, 2, 4
+; LE-NEXT:    addis 3, 2, .LCPI0_1 at toc@ha
+; LE-NEXT:    addi 3, 3, .LCPI0_1 at toc@l
+; LE-NEXT:    xxswapd 38, 0
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    vperm 4, 4, 7, 6
+; LE-NEXT:    vmuloub 7, 2, 5
+; LE-NEXT:    vmuleub 5, 2, 5
+; LE-NEXT:    vperm 5, 5, 7, 6
+; LE-NEXT:    xxland 39, 35, 32
+; LE-NEXT:    vslb 0, 0, 0
+; LE-NEXT:    vmuloub 8, 2, 7
+; LE-NEXT:    vmuleub 7, 2, 7
+; LE-NEXT:    xxland 32, 35, 32
+; LE-NEXT:    vperm 7, 7, 8, 6
+; LE-NEXT:    xxland 40, 35, 33
+; LE-NEXT:    vaddubm 1, 1, 1
+; LE-NEXT:    vmuloub 9, 2, 8
+; LE-NEXT:    vmuleub 8, 2, 8
+; LE-NEXT:    xxland 33, 35, 33
+; LE-NEXT:    vperm 8, 8, 9, 6
+; LE-NEXT:    vmuloub 9, 2, 1
+; LE-NEXT:    vmuleub 1, 2, 1
+; LE-NEXT:    vperm 1, 1, 9, 6
+; LE-NEXT:    xxland 41, 35, 0
+; LE-NEXT:    xxlxor 0, 37, 36
+; LE-NEXT:    vmuloub 10, 2, 9
+; LE-NEXT:    vmuleub 9, 2, 9
+; LE-NEXT:    xxlxor 0, 0, 39
+; LE-NEXT:    xxlxor 0, 0, 40
+; LE-NEXT:    xxlxor 0, 0, 33
+; LE-NEXT:    vperm 9, 9, 10, 6
+; LE-NEXT:    vmuloub 10, 2, 0
+; LE-NEXT:    vmuleub 0, 2, 0
+; LE-NEXT:    xxlxor 0, 0, 41
+; LE-NEXT:    vperm 0, 0, 10, 6
+; LE-NEXT:    xxleqv 42, 42, 42
+; LE-NEXT:    vslb 10, 10, 10
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxland 35, 35, 42
+; LE-NEXT:    vmuloub 10, 2, 3
+; LE-NEXT:    vmuleub 2, 2, 3
+; LE-NEXT:    vperm 2, 2, 10, 6
+; LE-NEXT:    xxlxor 34, 0, 34
+; LE-NEXT:    blr
+  %res = call <16 x i8> @llvm.clmul.v16i8(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; BE-LABEL: clmul_v8i16:
+; BE:       # %bb.0:
+; BE-NEXT:    addis 3, 2, .LCPI1_0 at toc@ha
+; BE-NEXT:    vspltish 6, 2
+; BE-NEXT:    addi 3, 3, .LCPI1_0 at toc@l
+; BE-NEXT:    vand 4, 3, 6
+; BE-NEXT:    lvx 13, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI1_1 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI1_1 at toc@l
+; BE-NEXT:    vspltish 7, 1
+; BE-NEXT:    lvx 14, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI1_2 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI1_2 at toc@l
+; BE-NEXT:    vspltish 8, 4
+; BE-NEXT:    lvx 15, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI1_3 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI1_3 at toc@l
+; BE-NEXT:    vspltish 9, 8
+; BE-NEXT:    vand 5, 3, 7
+; BE-NEXT:    lvx 16, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI1_4 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI1_4 at toc@l
+; BE-NEXT:    vspltisb 12, -1
+; BE-NEXT:    lvx 17, 0, 3
+; BE-NEXT:    vand 0, 3, 8
+; BE-NEXT:    vand 1, 3, 9
+; BE-NEXT:    vslh 10, 8, 8
+; BE-NEXT:    vsldoi 7, 7, 7, 1
+; BE-NEXT:    vsldoi 6, 6, 6, 1
+; BE-NEXT:    vsldoi 8, 8, 8, 1
+; BE-NEXT:    vslh 11, 9, 9
+; BE-NEXT:    vadduhm 9, 9, 9
+; BE-NEXT:    vslh 12, 12, 12
+; BE-NEXT:    vand 9, 3, 9
+; BE-NEXT:    vand 10, 3, 10
+; BE-NEXT:    vand 7, 3, 7
+; BE-NEXT:    vand 6, 3, 6
+; BE-NEXT:    vand 8, 3, 8
+; BE-NEXT:    vand 11, 3, 11
+; BE-NEXT:    vand 12, 3, 12
+; BE-NEXT:    vand 13, 3, 13
+; BE-NEXT:    vand 14, 3, 14
+; BE-NEXT:    vand 15, 3, 15
+; BE-NEXT:    vand 16, 3, 16
+; BE-NEXT:    vand 3, 3, 17
+; BE-NEXT:    vxor 17, 17, 17
+; BE-NEXT:    vmladduhm 4, 2, 4, 17
+; BE-NEXT:    vmladduhm 5, 2, 5, 17
+; BE-NEXT:    vmladduhm 0, 2, 0, 17
+; BE-NEXT:    vmladduhm 1, 2, 1, 17
+; BE-NEXT:    vmladduhm 9, 2, 9, 17
+; BE-NEXT:    vmladduhm 10, 2, 10, 17
+; BE-NEXT:    vmladduhm 7, 2, 7, 17
+; BE-NEXT:    vmladduhm 6, 2, 6, 17
+; BE-NEXT:    vmladduhm 8, 2, 8, 17
+; BE-NEXT:    vmladduhm 11, 2, 11, 17
+; BE-NEXT:    vmladduhm 12, 2, 12, 17
+; BE-NEXT:    vmladduhm 13, 2, 13, 17
+; BE-NEXT:    vmladduhm 14, 2, 14, 17
+; BE-NEXT:    vmladduhm 15, 2, 15, 17
+; BE-NEXT:    vmladduhm 16, 2, 16, 17
+; BE-NEXT:    vmladduhm 2, 2, 3, 17
+; BE-NEXT:    vxor 3, 5, 4
+; BE-NEXT:    vxor 3, 3, 0
+; BE-NEXT:    vxor 3, 3, 1
+; BE-NEXT:    vxor 3, 3, 9
+; BE-NEXT:    vxor 3, 3, 13
+; BE-NEXT:    vxor 3, 3, 10
+; BE-NEXT:    vxor 3, 3, 14
+; BE-NEXT:    vxor 3, 3, 7
+; BE-NEXT:    vxor 3, 3, 6
+; BE-NEXT:    vxor 3, 3, 8
+; BE-NEXT:    vxor 3, 3, 11
+; BE-NEXT:    vxor 3, 3, 15
+; BE-NEXT:    vxor 3, 3, 16
+; BE-NEXT:    vxor 2, 3, 2
+; BE-NEXT:    vxor 2, 2, 12
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmul_v8i16:
+; LE:       # %bb.0:
+; LE-NEXT:    vspltish 5, 2
+; LE-NEXT:    vspltish 0, 1
+; LE-NEXT:    addis 3, 2, .LCPI1_0 at toc@ha
+; LE-NEXT:    xxland 41, 35, 37
+; LE-NEXT:    vspltish 1, 4
+; LE-NEXT:    vspltish 4, 8
+; LE-NEXT:    addi 3, 3, .LCPI1_0 at toc@l
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    vsldoi 6, 0, 0, 1
+; LE-NEXT:    xxland 32, 35, 32
+; LE-NEXT:    vsldoi 7, 5, 5, 1
+; LE-NEXT:    vxor 5, 5, 5
+; LE-NEXT:    vmladduhm 9, 2, 9, 5
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    addis 3, 2, .LCPI1_1 at toc@ha
+; LE-NEXT:    addi 3, 3, .LCPI1_1 at toc@l
+; LE-NEXT:    vsldoi 8, 1, 1, 1
+; LE-NEXT:    xxlxor 0, 32, 41
+; LE-NEXT:    xxland 32, 35, 33
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxland 32, 35, 36
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    vadduhm 0, 4, 4
+; LE-NEXT:    vslh 4, 4, 4
+; LE-NEXT:    xxland 32, 35, 32
+; LE-NEXT:    xxland 36, 35, 36
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    vmladduhm 4, 2, 4, 5
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxland 32, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI1_2 at toc@ha
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    addi 3, 3, .LCPI1_2 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    vslh 0, 1, 1
+; LE-NEXT:    xxland 32, 35, 32
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxland 32, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI1_3 at toc@ha
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    addi 3, 3, .LCPI1_3 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxland 32, 35, 38
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxland 32, 35, 39
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxland 32, 35, 40
+; LE-NEXT:    vmladduhm 0, 2, 0, 5
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    xxlxor 0, 0, 36
+; LE-NEXT:    xxland 36, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI1_4 at toc@ha
+; LE-NEXT:    vmladduhm 4, 2, 4, 5
+; LE-NEXT:    addi 3, 3, .LCPI1_4 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 36
+; LE-NEXT:    xxland 36, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    vmladduhm 4, 2, 4, 5
+; LE-NEXT:    xxlxor 0, 0, 36
+; LE-NEXT:    xxland 36, 35, 1
+; LE-NEXT:    vmladduhm 4, 2, 4, 5
+; LE-NEXT:    xxlxor 0, 0, 36
+; LE-NEXT:    xxleqv 36, 36, 36
+; LE-NEXT:    vslh 4, 4, 4
+; LE-NEXT:    xxland 35, 35, 36
+; LE-NEXT:    vmladduhm 2, 2, 3, 5
+; LE-NEXT:    xxlxor 34, 0, 34
+; LE-NEXT:    blr
+  %res = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @clmul_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; BE-LABEL: clmul_v4i32:
+; BE:       # %bb.0:
+; BE-NEXT:    stdu 1, -1184(1)
+; BE-NEXT:    li 3, 992
+; BE-NEXT:    vspltisw 9, 4
+; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vand 4, 3, 9
+; BE-NEXT:    vspltisw 6, 8
+; BE-NEXT:    stvx 21, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1024
+; BE-NEXT:    vspltisw 11, 1
+; BE-NEXT:    stvx 22, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vand 1, 3, 11
+; BE-NEXT:    vspltisw 8, 2
+; BE-NEXT:    stvx 23, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1056
+; BE-NEXT:    vspltisb 17, -1
+; BE-NEXT:    stvx 24, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    stvx 25, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1088
+; BE-NEXT:    vsldoi 15, 11, 11, 1
+; BE-NEXT:    stvx 26, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1104
+; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1136
+; BE-NEXT:    vslw 18, 6, 6
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1152
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1168
+; BE-NEXT:    vsldoi 5, 11, 11, 2
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 960
+; BE-NEXT:    vand 4, 3, 6
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 624
+; BE-NEXT:    vsldoi 13, 6, 6, 2
+; BE-NEXT:    vsldoi 4, 11, 11, 3
+; BE-NEXT:    vsldoi 11, 6, 6, 3
+; BE-NEXT:    vadduwm 6, 6, 6
+; BE-NEXT:    vand 12, 3, 6
+; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 752
+; BE-NEXT:    vand 6, 3, 18
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 864
+; BE-NEXT:    vsldoi 19, 8, 8, 2
+; BE-NEXT:    vand 5, 3, 5
+; BE-NEXT:    stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 592
+; BE-NEXT:    vsldoi 0, 9, 9, 2
+; BE-NEXT:    vand 5, 3, 19
+; BE-NEXT:    stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 896
+; BE-NEXT:    vslw 10, 9, 9
+; BE-NEXT:    vsldoi 31, 9, 9, 1
+; BE-NEXT:    vsldoi 9, 9, 9, 3
+; BE-NEXT:    vand 0, 3, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vand 23, 3, 13
+; BE-NEXT:    vand 13, 3, 4
+; BE-NEXT:    vand 4, 3, 9
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI2_0 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_0 at toc@l
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI2_1 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_1 at toc@l
+; BE-NEXT:    vand 25, 3, 4
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    li 3, 928
+; BE-NEXT:    vand 4, 3, 4
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI2_2 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_2 at toc@l
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI2_3 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_3 at toc@l
+; BE-NEXT:    vand 16, 3, 10
+; BE-NEXT:    vand 10, 3, 4
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI2_4 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_4 at toc@l
+; BE-NEXT:    vand 30, 3, 4
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    li 3, 768
+; BE-NEXT:    vand 4, 3, 4
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI2_5 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_5 at toc@l
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    li 3, 704
+; BE-NEXT:    vand 4, 3, 4
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI2_6 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_6 at toc@l
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI2_7 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_7 at toc@l
+; BE-NEXT:    vand 27, 3, 4
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI2_8 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_8 at toc@l
+; BE-NEXT:    vand 22, 3, 4
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI2_9 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_9 at toc@l
+; BE-NEXT:    vand 21, 3, 4
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI2_10 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_10 at toc@l
+; BE-NEXT:    vand 20, 3, 4
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    li 3, 496
+; BE-NEXT:    vand 4, 3, 4
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI2_11 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_11 at toc@l
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    li 3, 448
+; BE-NEXT:    vand 4, 3, 4
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI2_12 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI2_12 at toc@l
+; BE-NEXT:    lvx 4, 0, 3
+; BE-NEXT:    li 3, 368
+; BE-NEXT:    vand 7, 3, 8
+; BE-NEXT:    vsldoi 14, 8, 8, 1
+; BE-NEXT:    vsldoi 8, 8, 8, 3
+; BE-NEXT:    vslw 17, 17, 17
+; BE-NEXT:    vand 15, 3, 15
+; BE-NEXT:    vand 14, 3, 14
+; BE-NEXT:    vand 24, 3, 31
+; BE-NEXT:    vand 26, 3, 8
+; BE-NEXT:    vand 11, 3, 11
+; BE-NEXT:    vand 9, 3, 17
+; BE-NEXT:    vand 3, 3, 4
+; BE-NEXT:    stvx 3, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    vspltisw 3, -16
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vmulouh 6, 2, 7
+; BE-NEXT:    vrlw 7, 7, 3
+; BE-NEXT:    vmulouh 8, 2, 1
+; BE-NEXT:    vrlw 1, 1, 3
+; BE-NEXT:    vxor 0, 0, 0
+; BE-NEXT:    vmsumuhm 7, 2, 7, 0
+; BE-NEXT:    vmsumuhm 1, 2, 1, 0
+; BE-NEXT:    vslw 7, 7, 3
+; BE-NEXT:    vadduwm 6, 6, 7
+; BE-NEXT:    vslw 1, 1, 3
+; BE-NEXT:    vadduwm 1, 8, 1
+; BE-NEXT:    vxor 4, 1, 6
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 816
+; BE-NEXT:    vrlw 1, 28, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 960
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 848
+; BE-NEXT:    vrlw 1, 29, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    vrlw 1, 12, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 832
+; BE-NEXT:    vrlw 1, 16, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 800
+; BE-NEXT:    vrlw 1, 15, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 784
+; BE-NEXT:    vrlw 1, 14, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 736
+; BE-NEXT:    vrlw 1, 24, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 752
+; BE-NEXT:    lvx 17, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 720
+; BE-NEXT:    vrlw 1, 17, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 864
+; BE-NEXT:    vmr 31, 16
+; BE-NEXT:    lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 688
+; BE-NEXT:    vrlw 1, 16, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 672
+; BE-NEXT:    vrlw 1, 5, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 896
+; BE-NEXT:    vmr 19, 15
+; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 656
+; BE-NEXT:    vrlw 1, 15, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 640
+; BE-NEXT:    vrlw 1, 23, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 608
+; BE-NEXT:    vrlw 1, 13, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 576
+; BE-NEXT:    vrlw 1, 26, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    lvx 12, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 544
+; BE-NEXT:    vrlw 1, 12, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 528
+; BE-NEXT:    vrlw 1, 11, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 512
+; BE-NEXT:    vrlw 1, 9, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 480
+; BE-NEXT:    vrlw 1, 25, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 928
+; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 464
+; BE-NEXT:    vrlw 1, 7, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 432
+; BE-NEXT:    vrlw 1, 10, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 400
+; BE-NEXT:    vrlw 1, 30, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 768
+; BE-NEXT:    vmr 18, 14
+; BE-NEXT:    vmr 14, 23
+; BE-NEXT:    vmr 23, 26
+; BE-NEXT:    vmr 26, 30
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 352
+; BE-NEXT:    vrlw 1, 30, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 704
+; BE-NEXT:    vmr 6, 25
+; BE-NEXT:    vmr 25, 10
+; BE-NEXT:    lvx 10, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 320
+; BE-NEXT:    vrlw 1, 10, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 288
+; BE-NEXT:    vrlw 1, 27, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 256
+; BE-NEXT:    vrlw 1, 22, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 224
+; BE-NEXT:    vrlw 1, 21, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 208
+; BE-NEXT:    vrlw 1, 20, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 496
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 192
+; BE-NEXT:    vrlw 1, 5, 3
+; BE-NEXT:    vmsumuhm 4, 2, 1, 0
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 448
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 160
+; BE-NEXT:    vrlw 1, 4, 3
+; BE-NEXT:    vmsumuhm 1, 2, 1, 0
+; BE-NEXT:    stvx 1, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 368
+; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 128
+; BE-NEXT:    vrlw 8, 1, 3
+; BE-NEXT:    vmsumuhm 0, 2, 8, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 624
+; BE-NEXT:    vmulouh 8, 2, 29
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 96
+; BE-NEXT:    vmulouh 29, 2, 29
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 176
+; BE-NEXT:    vmulouh 31, 2, 31
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 240
+; BE-NEXT:    vmulouh 19, 2, 19
+; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 272
+; BE-NEXT:    vmulouh 18, 2, 18
+; BE-NEXT:    stvx 18, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 304
+; BE-NEXT:    vmulouh 18, 2, 24
+; BE-NEXT:    stvx 18, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 336
+; BE-NEXT:    vmulouh 17, 2, 17
+; BE-NEXT:    stvx 17, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 560
+; BE-NEXT:    vmulouh 16, 2, 16
+; BE-NEXT:    stvx 16, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 592
+; BE-NEXT:    lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    vmulouh 16, 2, 16
+; BE-NEXT:    stvx 16, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 624
+; BE-NEXT:    vmulouh 15, 2, 15
+; BE-NEXT:    stvx 15, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 752
+; BE-NEXT:    vmulouh 14, 2, 14
+; BE-NEXT:    stvx 14, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 864
+; BE-NEXT:    vmulouh 13, 2, 13
+; BE-NEXT:    stvx 13, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 896
+; BE-NEXT:    vmulouh 13, 2, 23
+; BE-NEXT:    stvx 13, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vmulouh 12, 2, 12
+; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 960
+; BE-NEXT:    vmulouh 11, 2, 11
+; BE-NEXT:    stvx 11, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    vmulouh 9, 2, 9
+; BE-NEXT:    stvx 9, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 64
+; BE-NEXT:    vmulouh 23, 2, 6
+; BE-NEXT:    vmulouh 6, 2, 25
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 80
+; BE-NEXT:    vmulouh 6, 2, 26
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 112
+; BE-NEXT:    vmulouh 6, 2, 30
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 144
+; BE-NEXT:    vmulouh 6, 2, 10
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 384
+; BE-NEXT:    vmulouh 6, 2, 27
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 416
+; BE-NEXT:    vmulouh 6, 2, 22
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 704
+; BE-NEXT:    vmulouh 6, 2, 21
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 768
+; BE-NEXT:    vmulouh 6, 2, 20
+; BE-NEXT:    stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 928
+; BE-NEXT:    vmulouh 5, 2, 5
+; BE-NEXT:    stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 496
+; BE-NEXT:    vmulouh 0, 2, 28
+; BE-NEXT:    vmulouh 24, 2, 7
+; BE-NEXT:    vmulouh 20, 2, 4
+; BE-NEXT:    vmulouh 2, 2, 1
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 816
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vslw 9, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 848
+; BE-NEXT:    vadduwm 4, 0, 9
+; BE-NEXT:    vxor 4, 2, 4
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    vslw 9, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 832
+; BE-NEXT:    vadduwm 5, 8, 9
+; BE-NEXT:    vslw 9, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 800
+; BE-NEXT:    vslw 8, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 784
+; BE-NEXT:    vslw 10, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 736
+; BE-NEXT:    vslw 11, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 720
+; BE-NEXT:    vslw 12, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 688
+; BE-NEXT:    vslw 13, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 672
+; BE-NEXT:    vslw 18, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 656
+; BE-NEXT:    vslw 19, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 640
+; BE-NEXT:    vslw 31, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 608
+; BE-NEXT:    vslw 29, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 576
+; BE-NEXT:    vslw 22, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 544
+; BE-NEXT:    vslw 27, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 528
+; BE-NEXT:    vslw 25, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 512
+; BE-NEXT:    vslw 30, 2, 3
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vslw 2, 2, 3
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 480
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 464
+; BE-NEXT:    vxor 6, 4, 5
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 432
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 400
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 352
+; BE-NEXT:    vslw 2, 2, 3
+; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 320
+; BE-NEXT:    vadduwm 2, 23, 2
+; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 288
+; BE-NEXT:    lvx 14, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 256
+; BE-NEXT:    vslw 4, 4, 3
+; BE-NEXT:    vadduwm 4, 24, 4
+; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 224
+; BE-NEXT:    lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 208
+; BE-NEXT:    lvx 17, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 192
+; BE-NEXT:    vslw 5, 5, 3
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 160
+; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 128
+; BE-NEXT:    vslw 0, 0, 3
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 96
+; BE-NEXT:    vslw 1, 1, 3
+; BE-NEXT:    vslw 7, 7, 3
+; BE-NEXT:    vslw 14, 14, 3
+; BE-NEXT:    vslw 15, 15, 3
+; BE-NEXT:    vslw 16, 16, 3
+; BE-NEXT:    vslw 17, 17, 3
+; BE-NEXT:    vslw 28, 28, 3
+; BE-NEXT:    vslw 26, 26, 3
+; BE-NEXT:    vslw 3, 21, 3
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 176
+; BE-NEXT:    vadduwm 9, 21, 9
+; BE-NEXT:    vxor 6, 6, 9
+; BE-NEXT:    vxor 2, 6, 2
+; BE-NEXT:    lvx 6, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 240
+; BE-NEXT:    vadduwm 6, 6, 8
+; BE-NEXT:    vxor 2, 2, 6
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 272
+; BE-NEXT:    vadduwm 4, 4, 10
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 304
+; BE-NEXT:    vadduwm 4, 4, 11
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 336
+; BE-NEXT:    vadduwm 4, 4, 12
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 64
+; BE-NEXT:    vadduwm 4, 4, 13
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 80
+; BE-NEXT:    vadduwm 4, 4, 5
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 112
+; BE-NEXT:    vadduwm 4, 4, 0
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 144
+; BE-NEXT:    vadduwm 4, 4, 1
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 560
+; BE-NEXT:    vadduwm 4, 4, 7
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 592
+; BE-NEXT:    vadduwm 4, 4, 18
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 624
+; BE-NEXT:    vadduwm 4, 4, 19
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 752
+; BE-NEXT:    vadduwm 4, 4, 31
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 384
+; BE-NEXT:    vadduwm 4, 4, 29
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 416
+; BE-NEXT:    vadduwm 4, 4, 14
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 704
+; BE-NEXT:    vadduwm 4, 4, 15
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 768
+; BE-NEXT:    vadduwm 4, 4, 16
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 864
+; BE-NEXT:    vadduwm 4, 4, 17
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 896
+; BE-NEXT:    vadduwm 4, 4, 22
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vadduwm 4, 4, 27
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 960
+; BE-NEXT:    vadduwm 4, 4, 25
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 928
+; BE-NEXT:    vadduwm 4, 4, 30
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 496
+; BE-NEXT:    vadduwm 4, 4, 28
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    vadduwm 4, 20, 26
+; BE-NEXT:    vxor 2, 2, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    vadduwm 3, 4, 3
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1168
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1152
+; BE-NEXT:    vadduwm 3, 3, 4
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1136
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1104
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1088
+; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1056
+; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1024
+; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 992
+; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    addi 1, 1, 1184
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmul_v4i32:
+; LE:       # %bb.0:
+; LE-NEXT:    vspltisw 0, 2
+; LE-NEXT:    vspltisw 1, 1
+; LE-NEXT:    addis 3, 2, .LCPI2_0 at toc@ha
+; LE-NEXT:    xxland 45, 35, 32
+; LE-NEXT:    xxland 46, 35, 33
+; LE-NEXT:    vspltisw 5, 4
+; LE-NEXT:    vspltisw 4, 8
+; LE-NEXT:    addi 3, 3, .LCPI2_0 at toc@l
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    vmuluwm 13, 2, 13
+; LE-NEXT:    vmuluwm 14, 2, 14
+; LE-NEXT:    addis 3, 2, .LCPI2_1 at toc@ha
+; LE-NEXT:    vsldoi 12, 1, 1, 1
+; LE-NEXT:    vsldoi 8, 0, 0, 1
+; LE-NEXT:    addi 3, 3, .LCPI2_1 at toc@l
+; LE-NEXT:    xxland 44, 35, 44
+; LE-NEXT:    vmuluwm 12, 2, 12
+; LE-NEXT:    xxland 40, 35, 40
+; LE-NEXT:    vmuluwm 8, 2, 8
+; LE-NEXT:    vsldoi 10, 5, 5, 1
+; LE-NEXT:    vsldoi 6, 1, 1, 2
+; LE-NEXT:    xxland 38, 35, 38
+; LE-NEXT:    vmuluwm 6, 2, 6
+; LE-NEXT:    vsldoi 7, 0, 0, 2
+; LE-NEXT:    vsldoi 9, 5, 5, 2
+; LE-NEXT:    vsldoi 11, 4, 4, 2
+; LE-NEXT:    vsldoi 1, 1, 1, 3
+; LE-NEXT:    vsldoi 0, 0, 0, 3
+; LE-NEXT:    xxland 33, 35, 33
+; LE-NEXT:    vmuluwm 1, 2, 1
+; LE-NEXT:    xxland 32, 35, 32
+; LE-NEXT:    vmuluwm 0, 2, 0
+; LE-NEXT:    xxlxor 0, 46, 45
+; LE-NEXT:    xxland 45, 35, 37
+; LE-NEXT:    vmuluwm 13, 2, 13
+; LE-NEXT:    xxlxor 0, 0, 45
+; LE-NEXT:    xxland 45, 35, 36
+; LE-NEXT:    vmuluwm 13, 2, 13
+; LE-NEXT:    xxlxor 0, 0, 45
+; LE-NEXT:    vadduwm 13, 4, 4
+; LE-NEXT:    xxland 45, 35, 45
+; LE-NEXT:    vmuluwm 13, 2, 13
+; LE-NEXT:    xxlxor 0, 0, 45
+; LE-NEXT:    xxland 45, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_2 at toc@ha
+; LE-NEXT:    vmuluwm 13, 2, 13
+; LE-NEXT:    addi 3, 3, .LCPI2_2 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 45
+; LE-NEXT:    vslw 13, 5, 5
+; LE-NEXT:    xxland 45, 35, 45
+; LE-NEXT:    vmuluwm 13, 2, 13
+; LE-NEXT:    xxlxor 0, 0, 45
+; LE-NEXT:    xxland 45, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_3 at toc@ha
+; LE-NEXT:    vmuluwm 13, 2, 13
+; LE-NEXT:    addi 3, 3, .LCPI2_3 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 45
+; LE-NEXT:    xxlxor 0, 0, 44
+; LE-NEXT:    xxlxor 0, 0, 40
+; LE-NEXT:    xxland 40, 35, 42
+; LE-NEXT:    vmuluwm 8, 2, 8
+; LE-NEXT:    xxlxor 0, 0, 40
+; LE-NEXT:    vslw 8, 4, 4
+; LE-NEXT:    xxland 40, 35, 40
+; LE-NEXT:    vmuluwm 8, 2, 8
+; LE-NEXT:    xxlxor 0, 0, 40
+; LE-NEXT:    xxland 40, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_4 at toc@ha
+; LE-NEXT:    vmuluwm 8, 2, 8
+; LE-NEXT:    addi 3, 3, .LCPI2_4 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 40
+; LE-NEXT:    xxland 40, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_5 at toc@ha
+; LE-NEXT:    vmuluwm 8, 2, 8
+; LE-NEXT:    addi 3, 3, .LCPI2_5 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 40
+; LE-NEXT:    xxland 40, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_6 at toc@ha
+; LE-NEXT:    vmuluwm 8, 2, 8
+; LE-NEXT:    addi 3, 3, .LCPI2_6 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 40
+; LE-NEXT:    xxland 40, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_7 at toc@ha
+; LE-NEXT:    vmuluwm 8, 2, 8
+; LE-NEXT:    addi 3, 3, .LCPI2_7 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 40
+; LE-NEXT:    xxlxor 0, 0, 38
+; LE-NEXT:    xxland 38, 35, 39
+; LE-NEXT:    vmuluwm 6, 2, 6
+; LE-NEXT:    xxlxor 0, 0, 38
+; LE-NEXT:    xxland 38, 35, 41
+; LE-NEXT:    vmuluwm 6, 2, 6
+; LE-NEXT:    xxlxor 0, 0, 38
+; LE-NEXT:    xxland 38, 35, 43
+; LE-NEXT:    vmuluwm 6, 2, 6
+; LE-NEXT:    xxlxor 0, 0, 38
+; LE-NEXT:    xxland 38, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_8 at toc@ha
+; LE-NEXT:    vmuluwm 6, 2, 6
+; LE-NEXT:    addi 3, 3, .LCPI2_8 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 38
+; LE-NEXT:    xxland 38, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_9 at toc@ha
+; LE-NEXT:    vmuluwm 6, 2, 6
+; LE-NEXT:    addi 3, 3, .LCPI2_9 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 38
+; LE-NEXT:    xxland 38, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_10 at toc@ha
+; LE-NEXT:    vmuluwm 6, 2, 6
+; LE-NEXT:    addi 3, 3, .LCPI2_10 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 38
+; LE-NEXT:    xxland 38, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_11 at toc@ha
+; LE-NEXT:    vmuluwm 6, 2, 6
+; LE-NEXT:    addi 3, 3, .LCPI2_11 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 38
+; LE-NEXT:    xxlxor 0, 0, 33
+; LE-NEXT:    xxlxor 0, 0, 32
+; LE-NEXT:    vsldoi 5, 5, 5, 3
+; LE-NEXT:    xxland 37, 35, 37
+; LE-NEXT:    vmuluwm 5, 2, 5
+; LE-NEXT:    xxlxor 0, 0, 37
+; LE-NEXT:    vsldoi 4, 4, 4, 3
+; LE-NEXT:    xxland 36, 35, 36
+; LE-NEXT:    vmuluwm 4, 2, 4
+; LE-NEXT:    xxlxor 0, 0, 36
+; LE-NEXT:    xxland 36, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI2_12 at toc@ha
+; LE-NEXT:    vmuluwm 4, 2, 4
+; LE-NEXT:    addi 3, 3, .LCPI2_12 at toc@l
+; LE-NEXT:    xxlxor 0, 0, 36
+; LE-NEXT:    xxland 36, 35, 1
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    vmuluwm 4, 2, 4
+; LE-NEXT:    xxlxor 0, 0, 36
+; LE-NEXT:    xxland 36, 35, 1
+; LE-NEXT:    vmuluwm 4, 2, 4
+; LE-NEXT:    xxlxor 0, 0, 36
+; LE-NEXT:    xxleqv 36, 36, 36
+; LE-NEXT:    vslw 4, 4, 4
+; LE-NEXT:    xxland 35, 35, 36
+; LE-NEXT:    vmuluwm 2, 2, 3
+; LE-NEXT:    xxlxor 34, 0, 34
+; LE-NEXT:    blr
+  %res = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @clmul_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; BE-LABEL: clmul_v2i64:
+; BE:       # %bb.0:
+; BE-NEXT:    stdu 1, -1008(1)
+; BE-NEXT:    rlwinm 7, 5, 0, 30, 30
+; BE-NEXT:    rlwinm 8, 5, 0, 29, 29
+; BE-NEXT:    std 2, 856(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 848(1) # 8-byte Folded Spill
+; BE-NEXT:    clrldi 7, 5, 63
+; BE-NEXT:    mulld 2, 3, 7
+; BE-NEXT:    std 31, 1000(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 15, 872(1) # 8-byte Folded Spill
+; BE-NEXT:    std 7, 840(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 28, 28
+; BE-NEXT:    rlwinm 8, 5, 0, 27, 27
+; BE-NEXT:    std 14, 864(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 824(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 832(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 26, 26
+; BE-NEXT:    std 17, 888(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 25, 25
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 808(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 816(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 24, 24
+; BE-NEXT:    std 16, 880(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 23, 23
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 792(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 800(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 22, 22
+; BE-NEXT:    std 19, 904(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 21, 21
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 776(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 784(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 20, 20
+; BE-NEXT:    std 18, 896(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 19, 19
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 760(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 768(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 18, 18
+; BE-NEXT:    std 21, 920(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 17, 17
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 744(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 752(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 16, 16
+; BE-NEXT:    std 20, 912(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 15, 15
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 728(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 736(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 14, 14
+; BE-NEXT:    std 23, 936(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 13, 13
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 712(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 720(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 12, 12
+; BE-NEXT:    std 22, 928(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 11, 11
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 696(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 704(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 10, 10
+; BE-NEXT:    std 25, 952(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 9, 9
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 680(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 688(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 8, 8
+; BE-NEXT:    std 24, 944(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 7, 7
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 664(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 672(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 6, 6
+; BE-NEXT:    std 27, 968(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 8, 5, 0, 5, 5
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 648(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 656(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 4, 4
+; BE-NEXT:    std 26, 960(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    rldicr 8, 5, 0, 0
+; BE-NEXT:    std 7, 640(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 3, 3
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 632(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 2, 2
+; BE-NEXT:    std 29, 984(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 624(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 1, 1
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 616(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 7, 5, 0, 0, 0
+; BE-NEXT:    std 28, 976(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 608(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 32, 32
+; BE-NEXT:    rldicl 7, 7, 32, 31
+; BE-NEXT:    std 30, 992(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    mulld 8, 3, 8
+; BE-NEXT:    std 7, 592(1) # 8-byte Folded Spill
+; BE-NEXT:    std 8, 600(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 31, 33
+; BE-NEXT:    rldicl 7, 7, 33, 30
+; BE-NEXT:    rldicl 8, 5, 30, 34
+; BE-NEXT:    rldicl 8, 8, 34, 29
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 576(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 584(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 29, 35
+; BE-NEXT:    rldicl 7, 7, 35, 28
+; BE-NEXT:    rldicl 8, 5, 28, 36
+; BE-NEXT:    rldicl 8, 8, 36, 27
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 560(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 568(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 27, 37
+; BE-NEXT:    rldicl 7, 7, 37, 26
+; BE-NEXT:    rldicl 8, 5, 26, 38
+; BE-NEXT:    rldicl 8, 8, 38, 25
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 544(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 552(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 25, 39
+; BE-NEXT:    rldicl 7, 7, 39, 24
+; BE-NEXT:    rldicl 8, 5, 24, 40
+; BE-NEXT:    rldicl 8, 8, 40, 23
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 528(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 536(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 23, 41
+; BE-NEXT:    rldicl 7, 7, 41, 22
+; BE-NEXT:    rldicl 8, 5, 22, 42
+; BE-NEXT:    rldicl 8, 8, 42, 21
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 512(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 520(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 21, 43
+; BE-NEXT:    rldicl 7, 7, 43, 20
+; BE-NEXT:    rldicl 8, 5, 20, 44
+; BE-NEXT:    rldicl 8, 8, 44, 19
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 496(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 504(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 19, 45
+; BE-NEXT:    rldicl 7, 7, 45, 18
+; BE-NEXT:    rldicl 8, 5, 18, 46
+; BE-NEXT:    rldicl 8, 8, 46, 17
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 480(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 488(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 17, 47
+; BE-NEXT:    rldicl 7, 7, 47, 16
+; BE-NEXT:    rldicl 8, 5, 16, 48
+; BE-NEXT:    rldicl 8, 8, 48, 15
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 464(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 472(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 15, 49
+; BE-NEXT:    rldicl 7, 7, 49, 14
+; BE-NEXT:    rldicl 8, 5, 14, 50
+; BE-NEXT:    rldicl 8, 8, 50, 13
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 448(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 456(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 13, 51
+; BE-NEXT:    rldicl 7, 7, 51, 12
+; BE-NEXT:    rldicl 8, 5, 12, 52
+; BE-NEXT:    rldicl 8, 8, 52, 11
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 432(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 440(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 11, 53
+; BE-NEXT:    rldicl 7, 7, 53, 10
+; BE-NEXT:    rldicl 8, 5, 10, 54
+; BE-NEXT:    rldicl 8, 8, 54, 9
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 416(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 424(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 9, 55
+; BE-NEXT:    rldicl 7, 7, 55, 8
+; BE-NEXT:    rldicl 8, 5, 8, 56
+; BE-NEXT:    rldicl 8, 8, 56, 7
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 400(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 408(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 7, 57
+; BE-NEXT:    rldicl 7, 7, 57, 6
+; BE-NEXT:    rldicl 8, 5, 6, 58
+; BE-NEXT:    rldicl 8, 8, 58, 5
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 384(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 392(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 5, 59
+; BE-NEXT:    rldicl 7, 7, 59, 4
+; BE-NEXT:    rldicl 8, 5, 4, 60
+; BE-NEXT:    rldicl 8, 8, 60, 3
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    std 7, 368(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 7, 3, 8
+; BE-NEXT:    std 7, 376(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 7, 5, 3, 61
+; BE-NEXT:    rldicl 5, 5, 2, 62
+; BE-NEXT:    rldicl 7, 7, 61, 2
+; BE-NEXT:    rldicl 5, 5, 62, 1
+; BE-NEXT:    mulld 7, 3, 7
+; BE-NEXT:    mulld 3, 3, 5
+; BE-NEXT:    std 7, 352(1) # 8-byte Folded Spill
+; BE-NEXT:    std 3, 360(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 30, 30
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 344(1) # 8-byte Folded Spill
+; BE-NEXT:    clrldi 3, 6, 63
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 336(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 29, 29
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 28, 28
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 320(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 27, 27
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 312(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 26, 26
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 304(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 25, 25
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 296(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 24, 24
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 288(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 23, 23
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 280(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 22, 22
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 272(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 21, 21
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 264(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 20, 20
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 19, 19
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 18, 18
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 17, 17
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 16, 16
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 224(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 15, 15
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 216(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 14, 14
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 208(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 13, 13
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 200(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 12, 12
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 192(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 11, 11
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 184(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 10, 10
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 176(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 9, 9
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 168(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 8, 8
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 160(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 7, 7
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 152(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 6, 6
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 144(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 5, 5
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 136(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 4, 4
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 128(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 3, 3
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 120(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 2, 2
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 112(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 1, 1
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 104(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 6, 0, 0, 0
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 96(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 6, 32, 32
+; BE-NEXT:    rldicl 3, 3, 32, 31
+; BE-NEXT:    rldicr 5, 6, 0, 0
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 3, 80(1) # 8-byte Folded Spill
+; BE-NEXT:    std 5, 88(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 6, 31, 33
+; BE-NEXT:    rldicl 5, 6, 30, 34
+; BE-NEXT:    rldicl 3, 3, 33, 30
+; BE-NEXT:    rldicl 5, 5, 34, 29
+; BE-NEXT:    mulld 3, 4, 3
+; BE-NEXT:    std 3, 64(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 4, 5
+; BE-NEXT:    std 3, 72(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 6, 29, 35
+; BE-NEXT:    rldicl 5, 6, 28, 36
+; BE-NEXT:    rldicl 3, 3, 35, 28
+; BE-NEXT:    rldicl 5, 5, 36, 27
+; BE-NEXT:    mulld 31, 4, 3
+; BE-NEXT:    mulld 3, 4, 5
+; BE-NEXT:    std 3, 56(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 6, 27, 37
+; BE-NEXT:    rldicl 3, 3, 37, 26
+; BE-NEXT:    rldicl 5, 6, 26, 38
+; BE-NEXT:    mulld 15, 4, 3
+; BE-NEXT:    rldicl 3, 6, 25, 39
+; BE-NEXT:    rldicl 5, 5, 38, 25
+; BE-NEXT:    rldicl 3, 3, 39, 24
+; BE-NEXT:    mulld 14, 4, 5
+; BE-NEXT:    rldicl 5, 6, 24, 40
+; BE-NEXT:    mulld 17, 4, 3
+; BE-NEXT:    rldicl 3, 6, 23, 41
+; BE-NEXT:    rldicl 5, 5, 40, 23
+; BE-NEXT:    rldicl 3, 3, 41, 22
+; BE-NEXT:    mulld 16, 4, 5
+; BE-NEXT:    rldicl 5, 6, 22, 42
+; BE-NEXT:    mulld 19, 4, 3
+; BE-NEXT:    rldicl 3, 6, 21, 43
+; BE-NEXT:    rldicl 5, 5, 42, 21
+; BE-NEXT:    rldicl 3, 3, 43, 20
+; BE-NEXT:    mulld 18, 4, 5
+; BE-NEXT:    rldicl 5, 6, 20, 44
+; BE-NEXT:    mulld 21, 4, 3
+; BE-NEXT:    rldicl 3, 6, 19, 45
+; BE-NEXT:    rldicl 5, 5, 44, 19
+; BE-NEXT:    rldicl 3, 3, 45, 18
+; BE-NEXT:    mulld 20, 4, 5
+; BE-NEXT:    rldicl 5, 6, 18, 46
+; BE-NEXT:    mulld 23, 4, 3
+; BE-NEXT:    rldicl 3, 6, 17, 47
+; BE-NEXT:    rldicl 5, 5, 46, 17
+; BE-NEXT:    rldicl 3, 3, 47, 16
+; BE-NEXT:    mulld 22, 4, 5
+; BE-NEXT:    rldicl 5, 6, 16, 48
+; BE-NEXT:    mulld 25, 4, 3
+; BE-NEXT:    rldicl 3, 6, 15, 49
+; BE-NEXT:    rldicl 5, 5, 48, 15
+; BE-NEXT:    rldicl 3, 3, 49, 14
+; BE-NEXT:    mulld 24, 4, 5
+; BE-NEXT:    rldicl 5, 6, 14, 50
+; BE-NEXT:    mulld 27, 4, 3
+; BE-NEXT:    rldicl 3, 6, 13, 51
+; BE-NEXT:    rldicl 5, 5, 50, 13
+; BE-NEXT:    rldicl 3, 3, 51, 12
+; BE-NEXT:    mulld 26, 4, 5
+; BE-NEXT:    rldicl 5, 6, 12, 52
+; BE-NEXT:    mulld 29, 4, 3
+; BE-NEXT:    rldicl 3, 6, 11, 53
+; BE-NEXT:    rldicl 5, 5, 52, 11
+; BE-NEXT:    rldicl 3, 3, 53, 10
+; BE-NEXT:    mulld 28, 4, 5
+; BE-NEXT:    rldicl 5, 6, 10, 54
+; BE-NEXT:    mulld 0, 4, 3
+; BE-NEXT:    rldicl 3, 6, 9, 55
+; BE-NEXT:    rldicl 5, 5, 54, 9
+; BE-NEXT:    rldicl 3, 3, 55, 8
+; BE-NEXT:    mulld 30, 4, 5
+; BE-NEXT:    rldicl 5, 6, 8, 56
+; BE-NEXT:    mulld 11, 4, 3
+; BE-NEXT:    rldicl 3, 6, 7, 57
+; BE-NEXT:    rldicl 5, 5, 56, 7
+; BE-NEXT:    rldicl 3, 3, 57, 6
+; BE-NEXT:    mulld 12, 4, 5
+; BE-NEXT:    rldicl 5, 6, 6, 58
+; BE-NEXT:    mulld 9, 4, 3
+; BE-NEXT:    rldicl 3, 6, 5, 59
+; BE-NEXT:    rldicl 5, 5, 58, 5
+; BE-NEXT:    rldicl 3, 3, 59, 4
+; BE-NEXT:    mulld 10, 4, 5
+; BE-NEXT:    rldicl 5, 6, 4, 60
+; BE-NEXT:    mulld 7, 4, 3
+; BE-NEXT:    rldicl 3, 6, 3, 61
+; BE-NEXT:    rldicl 5, 5, 60, 3
+; BE-NEXT:    rldicl 6, 6, 2, 62
+; BE-NEXT:    rldicl 3, 3, 61, 2
+; BE-NEXT:    mulld 8, 4, 5
+; BE-NEXT:    rldicl 5, 6, 62, 1
+; BE-NEXT:    mulld 6, 4, 3
+; BE-NEXT:    ld 3, 848(1) # 8-byte Folded Reload
+; BE-NEXT:    mulld 4, 4, 5
+; BE-NEXT:    ld 5, 344(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 2, 3
+; BE-NEXT:    ld 2, 336(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 2, 5
+; BE-NEXT:    ld 2, 840(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 328(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 824(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 320(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 832(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 312(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 808(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 304(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 816(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 296(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 792(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 288(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 800(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 280(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 776(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 272(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 784(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 264(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 760(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 256(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 768(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 248(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 744(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 240(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 752(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 232(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 728(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 224(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 736(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 216(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 712(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 208(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 720(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 200(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 696(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 192(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 704(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 184(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 680(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 176(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 688(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 168(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 664(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 160(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 672(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 152(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 648(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 144(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 656(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 136(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 640(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 128(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 632(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 120(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 624(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 112(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 616(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 104(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 608(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 96(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 592(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 80(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 576(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 64(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 584(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 72(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 2
+; BE-NEXT:    ld 2, 560(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 31
+; BE-NEXT:    ld 31, 568(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    xor 3, 3, 31
+; BE-NEXT:    ld 31, 56(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 31
+; BE-NEXT:    ld 31, 544(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 15
+; BE-NEXT:    xor 5, 5, 14
+; BE-NEXT:    ld 15, 552(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 31
+; BE-NEXT:    xor 5, 5, 17
+; BE-NEXT:    xor 5, 5, 16
+; BE-NEXT:    xor 3, 3, 15
+; BE-NEXT:    ld 15, 528(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 19
+; BE-NEXT:    xor 5, 5, 18
+; BE-NEXT:    ld 17, 536(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 15
+; BE-NEXT:    xor 5, 5, 21
+; BE-NEXT:    xor 5, 5, 20
+; BE-NEXT:    xor 3, 3, 17
+; BE-NEXT:    ld 17, 512(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 23
+; BE-NEXT:    xor 5, 5, 22
+; BE-NEXT:    ld 19, 520(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 17
+; BE-NEXT:    xor 5, 5, 25
+; BE-NEXT:    xor 5, 5, 24
+; BE-NEXT:    xor 3, 3, 19
+; BE-NEXT:    ld 19, 496(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 27
+; BE-NEXT:    xor 5, 5, 26
+; BE-NEXT:    ld 21, 504(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 19
+; BE-NEXT:    xor 5, 5, 29
+; BE-NEXT:    xor 5, 5, 28
+; BE-NEXT:    xor 3, 3, 21
+; BE-NEXT:    ld 21, 480(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 0
+; BE-NEXT:    xor 5, 5, 30
+; BE-NEXT:    ld 23, 488(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 21
+; BE-NEXT:    xor 5, 5, 11
+; BE-NEXT:    xor 5, 5, 12
+; BE-NEXT:    xor 3, 3, 23
+; BE-NEXT:    ld 23, 464(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 9
+; BE-NEXT:    xor 5, 5, 10
+; BE-NEXT:    ld 25, 472(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 23
+; BE-NEXT:    xor 5, 5, 7
+; BE-NEXT:    xor 5, 5, 8
+; BE-NEXT:    xor 3, 3, 25
+; BE-NEXT:    ld 25, 448(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 5, 5, 6
+; BE-NEXT:    xor 4, 5, 4
+; BE-NEXT:    ld 27, 456(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 25
+; BE-NEXT:    xor 3, 3, 27
+; BE-NEXT:    ld 27, 432(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 29, 440(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 27
+; BE-NEXT:    xor 3, 3, 29
+; BE-NEXT:    ld 29, 416(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 0, 424(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 29
+; BE-NEXT:    xor 3, 3, 0
+; BE-NEXT:    ld 0, 400(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 11, 408(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 0
+; BE-NEXT:    xor 3, 3, 11
+; BE-NEXT:    ld 11, 384(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 9, 392(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 11
+; BE-NEXT:    xor 3, 3, 9
+; BE-NEXT:    ld 9, 368(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 7, 376(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 9
+; BE-NEXT:    xor 3, 3, 7
+; BE-NEXT:    ld 7, 352(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 6, 360(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 7
+; BE-NEXT:    ld 5, 600(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 6
+; BE-NEXT:    xor 3, 3, 5
+; BE-NEXT:    ld 5, 88(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 5
+; BE-NEXT:    ld 2, 856(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 31, 1000(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 30, 992(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 29, 984(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 28, 976(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 27, 968(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 26, 960(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 25, 952(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 24, 944(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 23, 936(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 22, 928(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 21, 920(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 20, 912(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 19, 904(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 18, 896(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 17, 888(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 16, 880(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 15, 872(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 14, 864(1) # 8-byte Folded Reload
+; BE-NEXT:    addi 1, 1, 1008
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmul_v2i64:
+; LE:       # %bb.0:
+; LE-NEXT:    stdu 1, -480(1)
+; LE-NEXT:    mfvsrd 4, 35
+; LE-NEXT:    mfvsrd 3, 34
+; LE-NEXT:    std 16, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    std 14, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    std 15, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    std 17, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    xxswapd 0, 35
+; LE-NEXT:    xxswapd 1, 34
+; LE-NEXT:    std 18, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    std 30, 464(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 30, 30
+; LE-NEXT:    clrldi 6, 4, 63
+; LE-NEXT:    rlwinm 7, 4, 0, 29, 29
+; LE-NEXT:    rlwinm 8, 4, 0, 28, 28
+; LE-NEXT:    rlwinm 9, 4, 0, 27, 27
+; LE-NEXT:    rlwinm 10, 4, 0, 26, 26
+; LE-NEXT:    rlwinm 11, 4, 0, 25, 25
+; LE-NEXT:    rlwinm 12, 4, 0, 24, 24
+; LE-NEXT:    rlwinm 0, 4, 0, 23, 23
+; LE-NEXT:    rlwinm 30, 4, 0, 22, 22
+; LE-NEXT:    std 19, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 456(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 29, 4, 0, 21, 21
+; LE-NEXT:    std 20, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    std 28, 448(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 28, 4, 0, 20, 20
+; LE-NEXT:    std 21, 392(1) # 8-byte Folded Spill
+; LE-NEXT:    std 22, 400(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    mulld 7, 3, 7
+; LE-NEXT:    mulld 8, 3, 8
+; LE-NEXT:    std 27, 440(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 27, 4, 0, 19, 19
+; LE-NEXT:    std 23, 408(1) # 8-byte Folded Spill
+; LE-NEXT:    std 26, 432(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 26, 4, 0, 18, 18
+; LE-NEXT:    std 24, 416(1) # 8-byte Folded Spill
+; LE-NEXT:    std 25, 424(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 25, 4, 0, 17, 17
+; LE-NEXT:    std 31, 472(1) # 8-byte Folded Spill
+; LE-NEXT:    std 2, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    xor 5, 6, 5
+; LE-NEXT:    std 8, 64(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 9
+; LE-NEXT:    xor 16, 5, 7
+; LE-NEXT:    rlwinm 5, 4, 0, 16, 16
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 80(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 10
+; LE-NEXT:    std 5, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 15, 15
+; LE-NEXT:    std 8, 96(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 11
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 112(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 12
+; LE-NEXT:    std 5, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 14, 14
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 128(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 0
+; LE-NEXT:    std 5, 288(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 13, 13
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 144(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 30
+; LE-NEXT:    std 5, 304(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 12, 12
+; LE-NEXT:    std 8, 160(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 29
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 184(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 28
+; LE-NEXT:    std 5, 312(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 11, 11
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 200(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 27
+; LE-NEXT:    std 5, 320(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 10, 10
+; LE-NEXT:    std 8, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 26
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 25
+; LE-NEXT:    std 5, 296(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 9, 9
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    std 5, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 8, 8
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 7, 7
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 6, 6
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 216(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 5, 5
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 208(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 4, 4
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 192(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 3, 3
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 176(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 2, 2
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 168(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 1, 1
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 152(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 0, 0
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 136(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 32, 32
+; LE-NEXT:    rldicl 5, 5, 32, 31
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 120(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 31, 33
+; LE-NEXT:    rldicl 5, 5, 33, 30
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 104(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 30, 34
+; LE-NEXT:    rldicl 5, 5, 34, 29
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 88(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 29, 35
+; LE-NEXT:    rldicl 5, 5, 35, 28
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 72(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 28, 36
+; LE-NEXT:    rldicl 5, 5, 36, 27
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 56(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 27, 37
+; LE-NEXT:    rldicl 5, 5, 37, 26
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 48(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 26, 38
+; LE-NEXT:    rldicl 5, 5, 38, 25
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 40(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 25, 39
+; LE-NEXT:    rldicl 5, 5, 39, 24
+; LE-NEXT:    mulld 14, 3, 5
+; LE-NEXT:    rldicl 5, 4, 24, 40
+; LE-NEXT:    rldicl 5, 5, 40, 23
+; LE-NEXT:    mulld 15, 3, 5
+; LE-NEXT:    rldicl 5, 4, 23, 41
+; LE-NEXT:    rldicl 5, 5, 41, 22
+; LE-NEXT:    mulld 17, 3, 5
+; LE-NEXT:    rldicl 5, 4, 22, 42
+; LE-NEXT:    rldicl 5, 5, 42, 21
+; LE-NEXT:    mulld 18, 3, 5
+; LE-NEXT:    rldicl 5, 4, 21, 43
+; LE-NEXT:    rldicl 5, 5, 43, 20
+; LE-NEXT:    mulld 19, 3, 5
+; LE-NEXT:    rldicl 5, 4, 20, 44
+; LE-NEXT:    rldicl 5, 5, 44, 19
+; LE-NEXT:    mulld 20, 3, 5
+; LE-NEXT:    rldicl 5, 4, 19, 45
+; LE-NEXT:    rldicl 5, 5, 45, 18
+; LE-NEXT:    mulld 21, 3, 5
+; LE-NEXT:    rldicl 5, 4, 18, 46
+; LE-NEXT:    rldicl 5, 5, 46, 17
+; LE-NEXT:    mulld 22, 3, 5
+; LE-NEXT:    rldicl 5, 4, 17, 47
+; LE-NEXT:    rldicl 5, 5, 47, 16
+; LE-NEXT:    mulld 23, 3, 5
+; LE-NEXT:    rldicl 5, 4, 16, 48
+; LE-NEXT:    rldicl 5, 5, 48, 15
+; LE-NEXT:    mulld 24, 3, 5
+; LE-NEXT:    rldicl 5, 4, 15, 49
+; LE-NEXT:    rldicl 5, 5, 49, 14
+; LE-NEXT:    mulld 25, 3, 5
+; LE-NEXT:    rldicl 5, 4, 14, 50
+; LE-NEXT:    rldicl 5, 5, 50, 13
+; LE-NEXT:    mulld 26, 3, 5
+; LE-NEXT:    rldicl 5, 4, 13, 51
+; LE-NEXT:    rldicl 5, 5, 51, 12
+; LE-NEXT:    mulld 27, 3, 5
+; LE-NEXT:    rldicl 5, 4, 12, 52
+; LE-NEXT:    rldicl 5, 5, 52, 11
+; LE-NEXT:    mulld 28, 3, 5
+; LE-NEXT:    rldicl 5, 4, 11, 53
+; LE-NEXT:    rldicl 5, 5, 53, 10
+; LE-NEXT:    mulld 29, 3, 5
+; LE-NEXT:    rldicl 5, 4, 10, 54
+; LE-NEXT:    rldicl 5, 5, 54, 9
+; LE-NEXT:    mulld 30, 3, 5
+; LE-NEXT:    rldicl 5, 4, 9, 55
+; LE-NEXT:    rldicl 5, 5, 55, 8
+; LE-NEXT:    mulld 0, 3, 5
+; LE-NEXT:    rldicl 5, 4, 8, 56
+; LE-NEXT:    rldicl 5, 5, 56, 7
+; LE-NEXT:    mulld 12, 3, 5
+; LE-NEXT:    rldicl 5, 4, 7, 57
+; LE-NEXT:    rldicl 5, 5, 57, 6
+; LE-NEXT:    mulld 11, 3, 5
+; LE-NEXT:    rldicl 5, 4, 6, 58
+; LE-NEXT:    rldicl 5, 5, 58, 5
+; LE-NEXT:    mulld 10, 3, 5
+; LE-NEXT:    rldicl 5, 4, 5, 59
+; LE-NEXT:    rldicl 5, 5, 59, 4
+; LE-NEXT:    mulld 9, 3, 5
+; LE-NEXT:    rldicl 5, 4, 4, 60
+; LE-NEXT:    rldicl 5, 5, 60, 3
+; LE-NEXT:    mulld 8, 3, 5
+; LE-NEXT:    rldicl 5, 4, 3, 61
+; LE-NEXT:    rldicl 5, 5, 61, 2
+; LE-NEXT:    mulld 7, 3, 5
+; LE-NEXT:    rldicl 5, 4, 2, 62
+; LE-NEXT:    rldicr 4, 4, 0, 0
+; LE-NEXT:    rldicl 5, 5, 62, 1
+; LE-NEXT:    mulld 6, 3, 5
+; LE-NEXT:    mulld 5, 3, 4
+; LE-NEXT:    mffprd 4, 0
+; LE-NEXT:    mffprd 3, 1
+; LE-NEXT:    rlwinm 2, 4, 0, 30, 30
+; LE-NEXT:    clrldi 31, 4, 63
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    mulld 31, 3, 31
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 64(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 29, 29
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 80(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 28, 28
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 96(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 27, 27
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 112(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 26, 26
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 128(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 25, 25
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 144(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 24, 24
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 160(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 23, 23
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 184(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 22, 22
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 200(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 21, 21
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 20, 20
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 19, 19
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 18, 18
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 17, 17
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 16, 16
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 288(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 15, 15
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 304(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 14, 14
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 312(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 13, 13
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 320(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 296(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 216(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 208(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 192(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 176(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 168(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 152(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 136(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 120(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 104(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 88(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 72(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 56(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 48(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 40(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 14
+; LE-NEXT:    ld 14, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 15
+; LE-NEXT:    ld 15, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 17, 16, 17
+; LE-NEXT:    ld 16, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 18, 17, 18
+; LE-NEXT:    ld 17, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 19, 18, 19
+; LE-NEXT:    ld 18, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 20, 19, 20
+; LE-NEXT:    ld 19, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 21, 20, 21
+; LE-NEXT:    ld 20, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 22, 21, 22
+; LE-NEXT:    ld 21, 392(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 23, 22, 23
+; LE-NEXT:    ld 22, 400(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 24, 23, 24
+; LE-NEXT:    ld 23, 408(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 25, 24, 25
+; LE-NEXT:    ld 24, 416(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 26, 25, 26
+; LE-NEXT:    ld 25, 424(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 27, 26, 27
+; LE-NEXT:    ld 26, 432(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 28, 27, 28
+; LE-NEXT:    ld 27, 440(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 29, 28, 29
+; LE-NEXT:    ld 28, 448(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 30, 29, 30
+; LE-NEXT:    ld 29, 456(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 0, 30, 0
+; LE-NEXT:    ld 30, 464(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 12, 0, 12
+; LE-NEXT:    xor 11, 12, 11
+; LE-NEXT:    xor 10, 11, 10
+; LE-NEXT:    xor 9, 10, 9
+; LE-NEXT:    xor 8, 9, 8
+; LE-NEXT:    xor 7, 8, 7
+; LE-NEXT:    xor 6, 7, 6
+; LE-NEXT:    xor 5, 6, 5
+; LE-NEXT:    rlwinm 6, 4, 0, 11, 11
+; LE-NEXT:    mtfprd 0, 5
+; LE-NEXT:    rlwinm 5, 4, 0, 12, 12
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    xor 5, 31, 5
+; LE-NEXT:    ld 31, 472(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 10, 10
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 9, 9
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 8, 8
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 7, 7
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 6, 6
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 5, 5
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 4, 4
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 3, 3
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 2, 2
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 1, 1
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 0, 0
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 32, 32
+; LE-NEXT:    rldicl 6, 6, 32, 31
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 31, 33
+; LE-NEXT:    rldicl 6, 6, 33, 30
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 30, 34
+; LE-NEXT:    rldicl 6, 6, 34, 29
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 29, 35
+; LE-NEXT:    rldicl 6, 6, 35, 28
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 28, 36
+; LE-NEXT:    rldicl 6, 6, 36, 27
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 27, 37
+; LE-NEXT:    rldicl 6, 6, 37, 26
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 26, 38
+; LE-NEXT:    rldicl 6, 6, 38, 25
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 25, 39
+; LE-NEXT:    rldicl 6, 6, 39, 24
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 24, 40
+; LE-NEXT:    rldicl 6, 6, 40, 23
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 23, 41
+; LE-NEXT:    rldicl 6, 6, 41, 22
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 22, 42
+; LE-NEXT:    rldicl 6, 6, 42, 21
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 21, 43
+; LE-NEXT:    rldicl 6, 6, 43, 20
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 20, 44
+; LE-NEXT:    rldicl 6, 6, 44, 19
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 19, 45
+; LE-NEXT:    rldicl 6, 6, 45, 18
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 18, 46
+; LE-NEXT:    rldicl 6, 6, 46, 17
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 17, 47
+; LE-NEXT:    rldicl 6, 6, 47, 16
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 16, 48
+; LE-NEXT:    rldicl 6, 6, 48, 15
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 15, 49
+; LE-NEXT:    rldicl 6, 6, 49, 14
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 14, 50
+; LE-NEXT:    rldicl 6, 6, 50, 13
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 13, 51
+; LE-NEXT:    rldicl 6, 6, 51, 12
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 12, 52
+; LE-NEXT:    rldicl 6, 6, 52, 11
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 11, 53
+; LE-NEXT:    rldicl 6, 6, 53, 10
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 10, 54
+; LE-NEXT:    rldicl 6, 6, 54, 9
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 9, 55
+; LE-NEXT:    rldicl 6, 6, 55, 8
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 8, 56
+; LE-NEXT:    rldicl 6, 6, 56, 7
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 7, 57
+; LE-NEXT:    rldicl 6, 6, 57, 6
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 6, 58
+; LE-NEXT:    rldicl 6, 6, 58, 5
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 5, 59
+; LE-NEXT:    rldicl 6, 6, 59, 4
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 4, 60
+; LE-NEXT:    rldicl 6, 6, 60, 3
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 3, 61
+; LE-NEXT:    rldicl 6, 6, 61, 2
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 2, 62
+; LE-NEXT:    rldicr 4, 4, 0, 0
+; LE-NEXT:    rldicl 6, 6, 62, 1
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    mulld 3, 3, 4
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    xor 3, 5, 3
+; LE-NEXT:    mtfprd 1, 3
+; LE-NEXT:    xxmrghd 34, 0, 1
+; LE-NEXT:    addi 1, 1, 480
+; LE-NEXT:    blr
+  %res = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %res
+}
+
+; clmulr_v16i8: zero-extends both <16 x i8> operands to <16 x i16>, calls
+; @llvm.clmul.v16i16 on the widened values, shifts every lane right by 7 and
+; truncates back to <16 x i8>. The BE-/LE- prefixed lines are FileCheck
+; expectations for the generated assembly (the RUN lines that define these
+; prefixes are outside this excerpt). Constant-pool operands use the
+; @toc@ha / @toc@l relocation operators.
+define <16 x i8> @clmulr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; BE-LABEL: clmulr_v16i8:
+; BE:       # %bb.0:
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vspltisb 4, 4
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vsrb 1, 3, 4
+; BE-NEXT:    vspltisb 5, 15
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vspltisb 7, -1
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI4_0@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI4_0@toc@l
+; BE-NEXT:    vand 3, 3, 5
+; BE-NEXT:    vspltisb 13, 8
+; BE-NEXT:    vslb 3, 3, 4
+; BE-NEXT:    vsrb 0, 2, 4
+; BE-NEXT:    vand 2, 2, 5
+; BE-NEXT:    vor 1, 1, 3
+; BE-NEXT:    lvx 3, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI4_1@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI4_1@toc@l
+; BE-NEXT:    vslb 2, 2, 4
+; BE-NEXT:    vor 0, 0, 2
+; BE-NEXT:    vspltisb 2, 2
+; BE-NEXT:    vsrb 9, 1, 2
+; BE-NEXT:    vand 1, 1, 3
+; BE-NEXT:    vand 9, 9, 3
+; BE-NEXT:    vslb 1, 1, 2
+; BE-NEXT:    vsrb 8, 0, 2
+; BE-NEXT:    vand 0, 0, 3
+; BE-NEXT:    vor 9, 9, 1
+; BE-NEXT:    lvx 1, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI4_3@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI4_3@toc@l
+; BE-NEXT:    lvx 15, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI4_2@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI4_2@toc@l
+; BE-NEXT:    vand 8, 8, 3
+; BE-NEXT:    vslb 0, 0, 2
+; BE-NEXT:    vor 8, 8, 0
+; BE-NEXT:    vspltisb 0, 1
+; BE-NEXT:    vsrb 11, 9, 0
+; BE-NEXT:    vand 9, 9, 1
+; BE-NEXT:    vaddubm 9, 9, 9
+; BE-NEXT:    vand 11, 11, 1
+; BE-NEXT:    vsrb 10, 8, 0
+; BE-NEXT:    vand 8, 8, 1
+; BE-NEXT:    vaddubm 8, 8, 8
+; BE-NEXT:    vor 9, 11, 9
+; BE-NEXT:    vslb 6, 4, 4
+; BE-NEXT:    vslb 7, 7, 7
+; BE-NEXT:    vand 10, 10, 1
+; BE-NEXT:    vand 14, 9, 13
+; BE-NEXT:    vaddubm 13, 13, 13
+; BE-NEXT:    vor 8, 10, 8
+; BE-NEXT:    vand 10, 9, 2
+; BE-NEXT:    vand 11, 9, 0
+; BE-NEXT:    vand 12, 9, 4
+; BE-NEXT:    vand 13, 9, 13
+; BE-NEXT:    vand 15, 9, 15
+; BE-NEXT:    vand 6, 9, 6
+; BE-NEXT:    vand 7, 9, 7
+; BE-NEXT:    vmuloub 9, 8, 10
+; BE-NEXT:    vmuleub 10, 8, 10
+; BE-NEXT:    vmuloub 16, 8, 11
+; BE-NEXT:    vmuleub 11, 8, 11
+; BE-NEXT:    vmuloub 17, 8, 12
+; BE-NEXT:    vmuleub 12, 8, 12
+; BE-NEXT:    vmuloub 18, 8, 14
+; BE-NEXT:    vmuleub 14, 8, 14
+; BE-NEXT:    vmuloub 19, 8, 13
+; BE-NEXT:    vmuleub 13, 8, 13
+; BE-NEXT:    vmuloub 31, 8, 15
+; BE-NEXT:    vmuleub 15, 8, 15
+; BE-NEXT:    vmuloub 30, 8, 6
+; BE-NEXT:    vmuleub 6, 8, 6
+; BE-NEXT:    vmuloub 29, 8, 7
+; BE-NEXT:    vmuleub 7, 8, 7
+; BE-NEXT:    lvx 8, 0, 3
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vperm 9, 10, 9, 8
+; BE-NEXT:    vperm 10, 11, 16, 8
+; BE-NEXT:    vperm 11, 12, 17, 8
+; BE-NEXT:    vperm 12, 14, 18, 8
+; BE-NEXT:    vperm 13, 13, 19, 8
+; BE-NEXT:    vperm 14, 15, 31, 8
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vperm 6, 6, 30, 8
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vperm 7, 7, 29, 8
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    vxor 8, 10, 9
+; BE-NEXT:    vxor 8, 8, 11
+; BE-NEXT:    vxor 8, 8, 12
+; BE-NEXT:    vxor 8, 8, 13
+; BE-NEXT:    vxor 8, 8, 14
+; BE-NEXT:    vxor 6, 8, 6
+; BE-NEXT:    vxor 6, 6, 7
+; BE-NEXT:    vand 5, 6, 5
+; BE-NEXT:    vsrb 7, 6, 4
+; BE-NEXT:    vslb 4, 5, 4
+; BE-NEXT:    vor 4, 7, 4
+; BE-NEXT:    vand 5, 4, 3
+; BE-NEXT:    vsrb 4, 4, 2
+; BE-NEXT:    vslb 2, 5, 2
+; BE-NEXT:    vand 3, 4, 3
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    vsrb 3, 2, 0
+; BE-NEXT:    vand 2, 2, 1
+; BE-NEXT:    vaddubm 2, 2, 2
+; BE-NEXT:    vand 3, 3, 1
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulr_v16i8:
+; LE:       # %bb.0:
+; LE-NEXT:    addis 3, 2, .LCPI4_0@toc@ha
+; LE-NEXT:    vspltisb 4, 4
+; LE-NEXT:    vspltisb 5, 2
+; LE-NEXT:    addi 3, 3, .LCPI4_0@toc@l
+; LE-NEXT:    vslb 1, 3, 4
+; LE-NEXT:    vsrb 3, 3, 4
+; LE-NEXT:    vslb 6, 2, 4
+; LE-NEXT:    vsrb 2, 2, 4
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI4_1@toc@ha
+; LE-NEXT:    xxlor 35, 35, 33
+; LE-NEXT:    xxlor 34, 34, 38
+; LE-NEXT:    vspltisb 0, 1
+; LE-NEXT:    addi 3, 3, .LCPI4_1@toc@l
+; LE-NEXT:    vsrb 1, 3, 5
+; LE-NEXT:    vsrb 7, 2, 5
+; LE-NEXT:    vspltisb 6, 8
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI4_2@toc@ha
+; LE-NEXT:    xxland 35, 35, 0
+; LE-NEXT:    xxland 34, 34, 0
+; LE-NEXT:    xxland 2, 33, 0
+; LE-NEXT:    xxland 3, 39, 0
+; LE-NEXT:    addi 3, 3, .LCPI4_2@toc@l
+; LE-NEXT:    vslb 3, 3, 5
+; LE-NEXT:    vslb 2, 2, 5
+; LE-NEXT:    xxlor 35, 2, 35
+; LE-NEXT:    xxlor 34, 3, 34
+; LE-NEXT:    lxvd2x 3, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI4_3@toc@ha
+; LE-NEXT:    vsrb 1, 3, 0
+; LE-NEXT:    xxland 35, 35, 1
+; LE-NEXT:    vsrb 7, 2, 0
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    addi 3, 3, .LCPI4_3@toc@l
+; LE-NEXT:    xxland 2, 33, 1
+; LE-NEXT:    vaddubm 3, 3, 3
+; LE-NEXT:    vaddubm 2, 2, 2
+; LE-NEXT:    xxlor 2, 2, 35
+; LE-NEXT:    xxland 35, 2, 37
+; LE-NEXT:    xxswapd 33, 3
+; LE-NEXT:    xxland 3, 39, 1
+; LE-NEXT:    xxlor 34, 3, 34
+; LE-NEXT:    lxvd2x 3, 0, 3
+; LE-NEXT:    vmuloub 7, 2, 3
+; LE-NEXT:    vmuleub 3, 2, 3
+; LE-NEXT:    vperm 3, 3, 7, 1
+; LE-NEXT:    xxland 39, 2, 32
+; LE-NEXT:    vmuloub 8, 2, 7
+; LE-NEXT:    vmuleub 7, 2, 7
+; LE-NEXT:    vperm 7, 7, 8, 1
+; LE-NEXT:    xxland 40, 2, 36
+; LE-NEXT:    vmuloub 9, 2, 8
+; LE-NEXT:    vmuleub 8, 2, 8
+; LE-NEXT:    vperm 8, 8, 9, 1
+; LE-NEXT:    xxland 41, 2, 38
+; LE-NEXT:    vaddubm 6, 6, 6
+; LE-NEXT:    vmuloub 10, 2, 9
+; LE-NEXT:    vmuleub 9, 2, 9
+; LE-NEXT:    xxland 38, 2, 38
+; LE-NEXT:    vperm 9, 9, 10, 1
+; LE-NEXT:    vmuloub 10, 2, 6
+; LE-NEXT:    vmuleub 6, 2, 6
+; LE-NEXT:    vperm 6, 6, 10, 1
+; LE-NEXT:    xxland 42, 2, 3
+; LE-NEXT:    vmuloub 11, 2, 10
+; LE-NEXT:    vmuleub 10, 2, 10
+; LE-NEXT:    vperm 10, 10, 11, 1
+; LE-NEXT:    vslb 11, 4, 4
+; LE-NEXT:    xxland 43, 2, 43
+; LE-NEXT:    vmuloub 12, 2, 11
+; LE-NEXT:    vmuleub 11, 2, 11
+; LE-NEXT:    vperm 11, 11, 12, 1
+; LE-NEXT:    xxleqv 44, 44, 44
+; LE-NEXT:    vslb 12, 12, 12
+; LE-NEXT:    xxland 44, 2, 44
+; LE-NEXT:    xxlxor 2, 39, 35
+; LE-NEXT:    xxlxor 2, 2, 40
+; LE-NEXT:    vmuloub 13, 2, 12
+; LE-NEXT:    vmuleub 2, 2, 12
+; LE-NEXT:    xxlxor 2, 2, 41
+; LE-NEXT:    xxlxor 2, 2, 38
+; LE-NEXT:    xxlxor 2, 2, 42
+; LE-NEXT:    xxlxor 2, 2, 43
+; LE-NEXT:    vperm 2, 2, 13, 1
+; LE-NEXT:    xxlxor 34, 2, 34
+; LE-NEXT:    vslb 3, 2, 4
+; LE-NEXT:    vsrb 2, 2, 4
+; LE-NEXT:    xxlor 34, 34, 35
+; LE-NEXT:    xxland 35, 34, 0
+; LE-NEXT:    vsrb 2, 2, 5
+; LE-NEXT:    vslb 3, 3, 5
+; LE-NEXT:    xxland 0, 34, 0
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    vsrb 3, 2, 0
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    xxland 0, 35, 1
+; LE-NEXT:    vaddubm 2, 2, 2
+; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    blr
+  %a.ext = zext <16 x i8> %a to <16 x i16>
+  %b.ext = zext <16 x i8> %b to <16 x i16>
+  %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+  %res.ext = lshr <16 x i16> %clmul, splat (i16 7)
+  %res = trunc <16 x i16> %res.ext to <16 x i8>
+  ret <16 x i8> %res
+}
+
+; clmulr_v8i16: zero-extends both <8 x i16> operands to <8 x i32>, calls
+; @llvm.clmul.v8i32 on the widened values, shifts every lane right by 15 and
+; truncates back to <8 x i16>. The BE-/LE- prefixed lines are FileCheck
+; expectations for the generated assembly (the RUN lines that define these
+; prefixes are outside this excerpt). Constant-pool operands use the
+; @toc@ha / @toc@l relocation operators.
+define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; BE-LABEL: clmulr_v8i16:
+; BE:       # %bb.0:
+; BE-NEXT:    li 3, -80
+; BE-NEXT:    vspltish 4, 8
+; BE-NEXT:    vxor 5, 5, 5
+; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -64
+; BE-NEXT:    vadduhm 19, 4, 4
+; BE-NEXT:    vspltisb 1, -1
+; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vspltish 0, 2
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vrlh 8, 2, 4
+; BE-NEXT:    vspltish 2, 4
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI5_0@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI5_0@toc@l
+; BE-NEXT:    vrlh 6, 3, 4
+; BE-NEXT:    vspltish 3, 1
+; BE-NEXT:    vslh 13, 1, 1
+; BE-NEXT:    vspltisb 1, 15
+; BE-NEXT:    vand 14, 8, 1
+; BE-NEXT:    vsrh 8, 8, 2
+; BE-NEXT:    vand 15, 6, 1
+; BE-NEXT:    vsrh 6, 6, 2
+; BE-NEXT:    vslh 14, 14, 2
+; BE-NEXT:    vand 8, 8, 1
+; BE-NEXT:    vslh 15, 15, 2
+; BE-NEXT:    vand 6, 6, 1
+; BE-NEXT:    vor 8, 8, 14
+; BE-NEXT:    vor 14, 6, 15
+; BE-NEXT:    lvx 6, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI5_1@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI5_1@toc@l
+; BE-NEXT:    vand 15, 8, 6
+; BE-NEXT:    vsrh 8, 8, 0
+; BE-NEXT:    vslh 15, 15, 0
+; BE-NEXT:    vand 8, 8, 6
+; BE-NEXT:    vor 15, 8, 15
+; BE-NEXT:    lvx 8, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI5_2@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI5_2@toc@l
+; BE-NEXT:    lvx 31, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI5_3@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI5_3@toc@l
+; BE-NEXT:    lvx 30, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI5_4@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI5_4@toc@l
+; BE-NEXT:    vand 16, 14, 6
+; BE-NEXT:    lvx 29, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI5_5@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI5_5@toc@l
+; BE-NEXT:    lvx 28, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI5_6@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI5_6@toc@l
+; BE-NEXT:    lvx 27, 0, 3
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vsrh 14, 14, 0
+; BE-NEXT:    vslh 16, 16, 0
+; BE-NEXT:    vand 14, 14, 6
+; BE-NEXT:    vor 14, 14, 16
+; BE-NEXT:    vsrh 17, 14, 3
+; BE-NEXT:    vand 14, 14, 8
+; BE-NEXT:    vadduhm 14, 14, 14
+; BE-NEXT:    vsrh 16, 15, 3
+; BE-NEXT:    vand 15, 15, 8
+; BE-NEXT:    vadduhm 15, 15, 15
+; BE-NEXT:    vand 17, 17, 8
+; BE-NEXT:    vand 16, 16, 8
+; BE-NEXT:    vor 14, 17, 14
+; BE-NEXT:    vslh 7, 2, 2
+; BE-NEXT:    vsldoi 9, 3, 3, 1
+; BE-NEXT:    vsldoi 10, 0, 0, 1
+; BE-NEXT:    vsldoi 11, 2, 2, 1
+; BE-NEXT:    vslh 12, 4, 4
+; BE-NEXT:    vor 15, 16, 15
+; BE-NEXT:    vand 16, 14, 0
+; BE-NEXT:    vand 17, 14, 3
+; BE-NEXT:    vand 18, 14, 2
+; BE-NEXT:    vand 19, 14, 19
+; BE-NEXT:    vand 31, 14, 31
+; BE-NEXT:    vand 7, 14, 7
+; BE-NEXT:    vand 30, 14, 30
+; BE-NEXT:    vand 9, 14, 9
+; BE-NEXT:    vand 10, 14, 10
+; BE-NEXT:    vand 11, 14, 11
+; BE-NEXT:    vand 12, 14, 12
+; BE-NEXT:    vand 29, 14, 29
+; BE-NEXT:    vand 28, 14, 28
+; BE-NEXT:    vand 27, 14, 27
+; BE-NEXT:    vand 13, 14, 13
+; BE-NEXT:    vand 14, 14, 4
+; BE-NEXT:    vmladduhm 16, 15, 16, 5
+; BE-NEXT:    vmladduhm 17, 15, 17, 5
+; BE-NEXT:    vmladduhm 18, 15, 18, 5
+; BE-NEXT:    vmladduhm 14, 15, 14, 5
+; BE-NEXT:    vmladduhm 19, 15, 19, 5
+; BE-NEXT:    vmladduhm 31, 15, 31, 5
+; BE-NEXT:    vmladduhm 7, 15, 7, 5
+; BE-NEXT:    vmladduhm 30, 15, 30, 5
+; BE-NEXT:    vmladduhm 9, 15, 9, 5
+; BE-NEXT:    vmladduhm 10, 15, 10, 5
+; BE-NEXT:    vmladduhm 11, 15, 11, 5
+; BE-NEXT:    vmladduhm 12, 15, 12, 5
+; BE-NEXT:    vmladduhm 29, 15, 29, 5
+; BE-NEXT:    vmladduhm 28, 15, 28, 5
+; BE-NEXT:    vmladduhm 27, 15, 27, 5
+; BE-NEXT:    vmladduhm 5, 15, 13, 5
+; BE-NEXT:    vxor 13, 17, 16
+; BE-NEXT:    vxor 13, 13, 18
+; BE-NEXT:    vxor 13, 13, 14
+; BE-NEXT:    vxor 13, 13, 19
+; BE-NEXT:    vxor 13, 13, 31
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vxor 7, 13, 7
+; BE-NEXT:    vxor 7, 7, 30
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vxor 7, 7, 9
+; BE-NEXT:    vxor 7, 7, 10
+; BE-NEXT:    vxor 7, 7, 11
+; BE-NEXT:    vxor 7, 7, 12
+; BE-NEXT:    vxor 7, 7, 29
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -64
+; BE-NEXT:    vxor 7, 7, 28
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -80
+; BE-NEXT:    vxor 7, 7, 27
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    vxor 5, 7, 5
+; BE-NEXT:    vrlh 4, 5, 4
+; BE-NEXT:    vand 5, 4, 1
+; BE-NEXT:    vsrh 4, 4, 2
+; BE-NEXT:    vslh 2, 5, 2
+; BE-NEXT:    vand 4, 4, 1
+; BE-NEXT:    vor 2, 4, 2
+; BE-NEXT:    vand 4, 2, 6
+; BE-NEXT:    vsrh 2, 2, 0
+; BE-NEXT:    vslh 4, 4, 0
+; BE-NEXT:    vand 2, 2, 6
+; BE-NEXT:    vor 2, 2, 4
+; BE-NEXT:    vsrh 3, 2, 3
+; BE-NEXT:    vand 2, 2, 8
+; BE-NEXT:    vadduhm 2, 2, 2
+; BE-NEXT:    vand 3, 3, 8
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulr_v8i16:
+; LE:       # %bb.0:
+; LE-NEXT:    vspltish 5, 8
+; LE-NEXT:    vspltisb 4, 15
+; LE-NEXT:    addis 3, 2, .LCPI5_0@toc@ha
+; LE-NEXT:    vrlh 2, 2, 5
+; LE-NEXT:    vspltish 0, 4
+; LE-NEXT:    addi 3, 3, .LCPI5_0@toc@l
+; LE-NEXT:    vspltish 1, 2
+; LE-NEXT:    vspltish 6, 1
+; LE-NEXT:    vrlh 3, 3, 5
+; LE-NEXT:    xxland 42, 34, 36
+; LE-NEXT:    vsrh 2, 2, 0
+; LE-NEXT:    vslh 10, 10, 0
+; LE-NEXT:    xxland 0, 34, 36
+; LE-NEXT:    vsldoi 7, 6, 6, 1
+; LE-NEXT:    vsldoi 8, 1, 1, 1
+; LE-NEXT:    vsldoi 9, 0, 0, 1
+; LE-NEXT:    xxlor 34, 0, 42
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI5_1@toc@ha
+; LE-NEXT:    addi 3, 3, .LCPI5_1@toc@l
+; LE-NEXT:    xxland 42, 34, 0
+; LE-NEXT:    vsrh 2, 2, 1
+; LE-NEXT:    vslh 10, 10, 1
+; LE-NEXT:    xxland 1, 34, 0
+; LE-NEXT:    xxlor 34, 1, 42
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI5_2@toc@ha
+; LE-NEXT:    vsrh 10, 2, 6
+; LE-NEXT:    addi 3, 3, .LCPI5_2@toc@l
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI5_3@toc@ha
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    xxland 2, 42, 1
+; LE-NEXT:    xxland 42, 35, 36
+; LE-NEXT:    vsrh 3, 3, 0
+; LE-NEXT:    addi 3, 3, .LCPI5_3@toc@l
+; LE-NEXT:    vadduhm 2, 2, 2
+; LE-NEXT:    vslh 10, 10, 0
+; LE-NEXT:    xxlor 34, 2, 34
+; LE-NEXT:    xxland 2, 35, 36
+; LE-NEXT:    xxlor 35, 2, 42
+; LE-NEXT:    xxland 42, 35, 0
+; LE-NEXT:    vsrh 3, 3, 1
+; LE-NEXT:    vslh 10, 10, 1
+; LE-NEXT:    xxland 2, 35, 0
+; LE-NEXT:    xxlor 35, 2, 42
+; LE-NEXT:    vsrh 10, 3, 6
+; LE-NEXT:    xxland 35, 35, 1
+; LE-NEXT:    xxland 2, 42, 1
+; LE-NEXT:    vadduhm 3, 3, 3
+; LE-NEXT:    xxlor 2, 2, 35
+; LE-NEXT:    vxor 3, 3, 3
+; LE-NEXT:    xxland 42, 2, 33
+; LE-NEXT:    xxland 43, 2, 38
+; LE-NEXT:    xxland 39, 2, 39
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    vmladduhm 11, 2, 11, 3
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 43, 42
+; LE-NEXT:    xxland 42, 2, 32
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    xxland 42, 2, 37
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    vadduhm 10, 5, 5
+; LE-NEXT:    xxland 42, 2, 42
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    xxland 42, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI5_4@toc@ha
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    addi 3, 3, .LCPI5_4@toc@l
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    vslh 10, 0, 0
+; LE-NEXT:    xxland 42, 2, 42
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    xxland 42, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI5_5@toc@ha
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    addi 3, 3, .LCPI5_5@toc@l
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 40
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 41
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    vslh 7, 5, 5
+; LE-NEXT:    xxland 39, 2, 39
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI5_6@toc@ha
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    addi 3, 3, .LCPI5_6@toc@l
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 4
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxleqv 39, 39, 39
+; LE-NEXT:    vslh 7, 7, 7
+; LE-NEXT:    xxland 39, 2, 39
+; LE-NEXT:    vmladduhm 2, 2, 7, 3
+; LE-NEXT:    xxlxor 34, 3, 34
+; LE-NEXT:    vrlh 2, 2, 5
+; LE-NEXT:    xxland 35, 34, 36
+; LE-NEXT:    vsrh 2, 2, 0
+; LE-NEXT:    vslh 3, 3, 0
+; LE-NEXT:    xxland 2, 34, 36
+; LE-NEXT:    xxlor 34, 2, 35
+; LE-NEXT:    xxland 35, 34, 0
+; LE-NEXT:    vsrh 2, 2, 1
+; LE-NEXT:    vslh 3, 3, 1
+; LE-NEXT:    xxland 0, 34, 0
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    vsrh 3, 2, 6
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    xxland 0, 35, 1
+; LE-NEXT:    vadduhm 2, 2, 2
+; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    blr
+  %a.ext = zext <8 x i16> %a to <8 x i32>
+  %b.ext = zext <8 x i16> %b to <8 x i32>
+  %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext)
+  %res.ext = lshr <8 x i32> %clmul, splat (i32 15)
+  %res = trunc <8 x i32> %res.ext to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; BE-LABEL: clmulr_v4i32:
+; BE:       # %bb.0:
+; BE-NEXT:    stdu 1, -1472(1)
+; BE-NEXT:    li 3, 1280
+; BE-NEXT:    vspltisb 12, -1
+; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1296
+; BE-NEXT:    vslw 15, 12, 12
+; BE-NEXT:    vspltisw 12, 12
+; BE-NEXT:    stvx 21, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1312
+; BE-NEXT:    vadduwm 17, 12, 12
+; BE-NEXT:    vspltisw 18, 8
+; BE-NEXT:    stvx 22, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1328
+; BE-NEXT:    vsrw 6, 2, 18
+; BE-NEXT:    vspltisw 19, 4
+; BE-NEXT:    stvx 23, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1344
+; BE-NEXT:    stvx 24, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1360
+; BE-NEXT:    stvx 25, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1376
+; BE-NEXT:    vsrw 9, 3, 18
+; BE-NEXT:    stvx 26, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1392
+; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1408
+; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1424
+; BE-NEXT:    vsrw 12, 2, 17
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1440
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1456
+; BE-NEXT:    vspltisw 30, 2
+; BE-NEXT:    vslw 14, 2, 17
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1264
+; BE-NEXT:    vspltisw 31, 1
+; BE-NEXT:    stvx 17, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI6_0 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_0 at toc@l
+; BE-NEXT:    lvx 29, 0, 3
+; BE-NEXT:    li 3, 1248
+; BE-NEXT:    vsrw 16, 3, 17
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1232
+; BE-NEXT:    vslw 17, 3, 17
+; BE-NEXT:    vand 2, 2, 29
+; BE-NEXT:    vand 3, 3, 29
+; BE-NEXT:    vand 6, 6, 29
+; BE-NEXT:    vand 9, 9, 29
+; BE-NEXT:    vslw 2, 2, 18
+; BE-NEXT:    vslw 3, 3, 18
+; BE-NEXT:    vor 6, 6, 12
+; BE-NEXT:    vspltisb 12, 15
+; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI6_1 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_1 at toc@l
+; BE-NEXT:    vor 9, 9, 16
+; BE-NEXT:    vor 2, 14, 2
+; BE-NEXT:    vor 3, 17, 3
+; BE-NEXT:    vor 2, 2, 6
+; BE-NEXT:    vor 3, 3, 9
+; BE-NEXT:    vand 6, 2, 12
+; BE-NEXT:    vsrw 2, 2, 19
+; BE-NEXT:    vand 9, 3, 12
+; BE-NEXT:    vsrw 3, 3, 19
+; BE-NEXT:    vand 2, 2, 12
+; BE-NEXT:    vand 3, 3, 12
+; BE-NEXT:    lvx 12, 0, 3
+; BE-NEXT:    li 3, 1216
+; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI6_2 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_2 at toc@l
+; BE-NEXT:    vslw 6, 6, 19
+; BE-NEXT:    vslw 9, 9, 19
+; BE-NEXT:    vor 2, 2, 6
+; BE-NEXT:    vor 3, 3, 9
+; BE-NEXT:    vand 6, 2, 12
+; BE-NEXT:    vsrw 2, 2, 30
+; BE-NEXT:    vand 9, 3, 12
+; BE-NEXT:    vsrw 3, 3, 30
+; BE-NEXT:    vand 2, 2, 12
+; BE-NEXT:    vand 3, 3, 12
+; BE-NEXT:    lvx 12, 0, 3
+; BE-NEXT:    li 3, 1200
+; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1136
+; BE-NEXT:    stvx 18, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI6_3 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_3 at toc@l
+; BE-NEXT:    vslw 6, 6, 30
+; BE-NEXT:    vslw 9, 9, 30
+; BE-NEXT:    vor 2, 2, 6
+; BE-NEXT:    vor 3, 3, 9
+; BE-NEXT:    vsrw 6, 2, 31
+; BE-NEXT:    vand 2, 2, 12
+; BE-NEXT:    vadduwm 2, 2, 2
+; BE-NEXT:    vsrw 9, 3, 31
+; BE-NEXT:    vand 3, 3, 12
+; BE-NEXT:    vand 6, 6, 12
+; BE-NEXT:    vand 12, 9, 12
+; BE-NEXT:    vor 9, 6, 2
+; BE-NEXT:    vadduwm 2, 3, 3
+; BE-NEXT:    vor 14, 12, 2
+; BE-NEXT:    vadduwm 2, 18, 18
+; BE-NEXT:    vand 28, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_4 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_4 at toc@l
+; BE-NEXT:    vand 27, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_5 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_5 at toc@l
+; BE-NEXT:    vand 25, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_6 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_6 at toc@l
+; BE-NEXT:    vslw 4, 19, 19
+; BE-NEXT:    vand 26, 14, 4
+; BE-NEXT:    vand 4, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_7 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_7 at toc@l
+; BE-NEXT:    vsldoi 5, 31, 31, 1
+; BE-NEXT:    vand 24, 14, 5
+; BE-NEXT:    vand 5, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_8 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_8 at toc@l
+; BE-NEXT:    vand 29, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_9 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_9 at toc@l
+; BE-NEXT:    vand 21, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_10 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_10 at toc@l
+; BE-NEXT:    vslw 7, 18, 18
+; BE-NEXT:    vand 3, 14, 7
+; BE-NEXT:    vand 7, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_11 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_11 at toc@l
+; BE-NEXT:    vsldoi 13, 18, 18, 2
+; BE-NEXT:    vand 16, 14, 13
+; BE-NEXT:    vand 13, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI6_12 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_12 at toc@l
+; BE-NEXT:    vand 12, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 1184
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1168
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1152
+; BE-NEXT:    vsldoi 11, 31, 31, 2
+; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    vsldoi 1, 19, 19, 1
+; BE-NEXT:    vsldoi 10, 30, 30, 2
+; BE-NEXT:    vand 20, 14, 11
+; BE-NEXT:    vand 11, 14, 2
+; BE-NEXT:    vsldoi 2, 31, 31, 3
+; BE-NEXT:    vsldoi 8, 19, 19, 2
+; BE-NEXT:    vand 22, 14, 1
+; BE-NEXT:    vand 1, 14, 10
+; BE-NEXT:    vand 10, 14, 2
+; BE-NEXT:    vsldoi 2, 30, 30, 3
+; BE-NEXT:    vand 17, 14, 8
+; BE-NEXT:    vand 8, 14, 2
+; BE-NEXT:    vsldoi 2, 19, 19, 3
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vsldoi 2, 18, 18, 3
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI6_13 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_13 at toc@l
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI6_14 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_14 at toc@l
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 288
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI6_15 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI6_15 at toc@l
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 192
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 272
+; BE-NEXT:    vand 2, 14, 15
+; BE-NEXT:    vspltisw 15, -16
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    vand 2, 14, 30
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vand 31, 14, 31
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vsldoi 0, 30, 30, 1
+; BE-NEXT:    vand 19, 14, 19
+; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    vand 23, 14, 0
+; BE-NEXT:    vand 14, 14, 18
+; BE-NEXT:    stvx 14, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    vxor 6, 6, 6
+; BE-NEXT:    vrlw 0, 2, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1104
+; BE-NEXT:    vrlw 0, 31, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1088
+; BE-NEXT:    vrlw 0, 19, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1056
+; BE-NEXT:    vrlw 0, 14, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1024
+; BE-NEXT:    vrlw 0, 28, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 992
+; BE-NEXT:    vrlw 0, 27, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 960
+; BE-NEXT:    vrlw 0, 26, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 928
+; BE-NEXT:    vrlw 0, 25, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 896
+; BE-NEXT:    vrlw 0, 24, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 864
+; BE-NEXT:    vrlw 0, 23, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 832
+; BE-NEXT:    vrlw 0, 22, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 800
+; BE-NEXT:    vrlw 0, 3, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 768
+; BE-NEXT:    vrlw 0, 4, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 736
+; BE-NEXT:    vrlw 0, 5, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 704
+; BE-NEXT:    vrlw 0, 29, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 672
+; BE-NEXT:    vrlw 0, 21, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 640
+; BE-NEXT:    vrlw 0, 20, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 592
+; BE-NEXT:    vrlw 0, 1, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 560
+; BE-NEXT:    vrlw 0, 17, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 528
+; BE-NEXT:    vrlw 0, 16, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 496
+; BE-NEXT:    vrlw 0, 7, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 464
+; BE-NEXT:    vrlw 0, 13, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 432
+; BE-NEXT:    vrlw 0, 12, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 400
+; BE-NEXT:    vrlw 0, 11, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 368
+; BE-NEXT:    vrlw 0, 10, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 336
+; BE-NEXT:    vrlw 0, 8, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    vmr 14, 7
+; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 304
+; BE-NEXT:    vrlw 0, 7, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vmr 30, 1
+; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 240
+; BE-NEXT:    vrlw 0, 1, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vmr 19, 5
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 208
+; BE-NEXT:    vrlw 0, 5, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 288
+; BE-NEXT:    vmr 18, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 160
+; BE-NEXT:    vrlw 0, 4, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 192
+; BE-NEXT:    vmr 31, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 128
+; BE-NEXT:    vrlw 0, 3, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 272
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 64
+; BE-NEXT:    vrlw 0, 2, 15
+; BE-NEXT:    vmsumuhm 0, 9, 0, 6
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 96
+; BE-NEXT:    vmulouh 0, 9, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 80
+; BE-NEXT:    vmulouh 0, 9, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 112
+; BE-NEXT:    vmulouh 0, 9, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 144
+; BE-NEXT:    vmulouh 0, 9, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 176
+; BE-NEXT:    vmulouh 0, 9, 28
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 224
+; BE-NEXT:    vmulouh 0, 9, 27
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 256
+; BE-NEXT:    vmulouh 0, 9, 26
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 320
+; BE-NEXT:    vmulouh 0, 9, 25
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 352
+; BE-NEXT:    vmulouh 0, 9, 24
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 384
+; BE-NEXT:    vmulouh 0, 9, 23
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 416
+; BE-NEXT:    vmulouh 0, 9, 22
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 448
+; BE-NEXT:    vmulouh 0, 9, 31
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 480
+; BE-NEXT:    vmulouh 0, 9, 18
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 512
+; BE-NEXT:    vmulouh 0, 9, 19
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 544
+; BE-NEXT:    vmulouh 0, 9, 29
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 576
+; BE-NEXT:    vmulouh 0, 9, 21
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 608
+; BE-NEXT:    vmulouh 0, 9, 20
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 624
+; BE-NEXT:    vmulouh 0, 9, 30
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 656
+; BE-NEXT:    vmulouh 0, 9, 17
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 688
+; BE-NEXT:    vmulouh 0, 9, 16
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 720
+; BE-NEXT:    vmulouh 0, 9, 14
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 752
+; BE-NEXT:    vmulouh 0, 9, 13
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 784
+; BE-NEXT:    vmulouh 0, 9, 12
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 816
+; BE-NEXT:    vmulouh 0, 9, 11
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 848
+; BE-NEXT:    vmulouh 0, 9, 10
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    vmulouh 0, 9, 8
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vmulouh 0, 9, 7
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vmulouh 0, 9, 1
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    vmulouh 5, 9, 5
+; BE-NEXT:    stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vmulouh 4, 9, 4
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vmulouh 3, 9, 3
+; BE-NEXT:    stvx 3, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    vmulouh 2, 9, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1104
+; BE-NEXT:    vslw 9, 2, 15
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1088
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1056
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1024
+; BE-NEXT:    vslw 2, 2, 15
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 992
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 960
+; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 928
+; BE-NEXT:    vslw 3, 3, 15
+; BE-NEXT:    lvx 6, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 896
+; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 864
+; BE-NEXT:    lvx 8, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 832
+; BE-NEXT:    vslw 4, 4, 15
+; BE-NEXT:    lvx 10, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 800
+; BE-NEXT:    lvx 11, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 768
+; BE-NEXT:    vslw 5, 5, 15
+; BE-NEXT:    lvx 12, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 736
+; BE-NEXT:    lvx 13, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 704
+; BE-NEXT:    lvx 14, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 672
+; BE-NEXT:    vslw 0, 0, 15
+; BE-NEXT:    lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 640
+; BE-NEXT:    lvx 17, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 592
+; BE-NEXT:    lvx 18, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 560
+; BE-NEXT:    vslw 1, 1, 15
+; BE-NEXT:    lvx 19, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 528
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 496
+; BE-NEXT:    vslw 6, 6, 15
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 464
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 432
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 400
+; BE-NEXT:    vslw 7, 7, 15
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 368
+; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 336
+; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 304
+; BE-NEXT:    vslw 8, 8, 15
+; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 240
+; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 208
+; BE-NEXT:    vslw 10, 10, 15
+; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 160
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 128
+; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    vslw 11, 11, 15
+; BE-NEXT:    vslw 20, 20, 15
+; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 64
+; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 96
+; BE-NEXT:    vslw 12, 12, 15
+; BE-NEXT:    vslw 13, 13, 15
+; BE-NEXT:    vslw 14, 14, 15
+; BE-NEXT:    vslw 16, 16, 15
+; BE-NEXT:    vslw 17, 17, 15
+; BE-NEXT:    vslw 18, 18, 15
+; BE-NEXT:    vslw 19, 19, 15
+; BE-NEXT:    vslw 31, 31, 15
+; BE-NEXT:    vslw 30, 30, 15
+; BE-NEXT:    vslw 29, 29, 15
+; BE-NEXT:    vslw 28, 28, 15
+; BE-NEXT:    vslw 27, 27, 15
+; BE-NEXT:    vslw 26, 26, 15
+; BE-NEXT:    vslw 25, 25, 15
+; BE-NEXT:    vslw 24, 24, 15
+; BE-NEXT:    vslw 23, 23, 15
+; BE-NEXT:    vslw 22, 22, 15
+; BE-NEXT:    vslw 21, 21, 15
+; BE-NEXT:    vslw 20, 20, 15
+; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 80
+; BE-NEXT:    vadduwm 9, 15, 9
+; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 112
+; BE-NEXT:    vadduwm 2, 15, 2
+; BE-NEXT:    vxor 2, 2, 9
+; BE-NEXT:    lvx 9, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 144
+; BE-NEXT:    vadduwm 3, 9, 3
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 176
+; BE-NEXT:    vadduwm 3, 3, 4
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 224
+; BE-NEXT:    vadduwm 3, 3, 5
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 256
+; BE-NEXT:    vadduwm 3, 3, 0
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 320
+; BE-NEXT:    vadduwm 3, 3, 1
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 352
+; BE-NEXT:    vadduwm 3, 3, 6
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 384
+; BE-NEXT:    vadduwm 3, 3, 7
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 416
+; BE-NEXT:    vadduwm 3, 3, 8
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 448
+; BE-NEXT:    vadduwm 3, 3, 10
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 480
+; BE-NEXT:    vadduwm 3, 3, 11
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 512
+; BE-NEXT:    vadduwm 3, 3, 12
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 544
+; BE-NEXT:    vadduwm 3, 3, 13
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 576
+; BE-NEXT:    vadduwm 3, 3, 14
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 608
+; BE-NEXT:    vadduwm 3, 3, 16
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 624
+; BE-NEXT:    vadduwm 3, 3, 17
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 656
+; BE-NEXT:    vadduwm 3, 3, 18
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 688
+; BE-NEXT:    vadduwm 3, 3, 19
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 720
+; BE-NEXT:    vadduwm 3, 3, 31
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 752
+; BE-NEXT:    vadduwm 3, 3, 30
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 784
+; BE-NEXT:    vadduwm 3, 3, 29
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 816
+; BE-NEXT:    vadduwm 3, 3, 28
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 848
+; BE-NEXT:    vadduwm 3, 3, 27
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    vadduwm 3, 3, 26
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vadduwm 3, 3, 25
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vadduwm 3, 3, 24
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    vadduwm 3, 3, 23
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vadduwm 3, 3, 22
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vadduwm 3, 3, 21
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    vadduwm 3, 3, 4
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1264
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1136
+; BE-NEXT:    vadduwm 3, 3, 20
+; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1248
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1232
+; BE-NEXT:    vsrw 3, 2, 5
+; BE-NEXT:    vsrw 4, 2, 1
+; BE-NEXT:    vslw 5, 2, 5
+; BE-NEXT:    vand 2, 2, 0
+; BE-NEXT:    vslw 2, 2, 1
+; BE-NEXT:    vand 4, 4, 0
+; BE-NEXT:    vor 2, 5, 2
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1152
+; BE-NEXT:    vor 3, 4, 3
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1216
+; BE-NEXT:    vor 2, 2, 3
+; BE-NEXT:    vand 3, 2, 5
+; BE-NEXT:    vsrw 2, 2, 4
+; BE-NEXT:    vand 2, 2, 5
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1168
+; BE-NEXT:    vslw 3, 3, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1184
+; BE-NEXT:    vor 2, 2, 3
+; BE-NEXT:    vand 3, 2, 5
+; BE-NEXT:    vsrw 2, 2, 4
+; BE-NEXT:    vslw 3, 3, 4
+; BE-NEXT:    vand 2, 2, 5
+; BE-NEXT:    vor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1200
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1456
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1440
+; BE-NEXT:    vsrw 3, 2, 3
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1424
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1408
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1392
+; BE-NEXT:    vand 2, 2, 4
+; BE-NEXT:    vadduwm 2, 2, 2
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1376
+; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1360
+; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1344
+; BE-NEXT:    vand 3, 3, 4
+; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1328
+; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1312
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1296
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1280
+; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    addi 1, 1, 1472
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulr_v4i32:
+; LE:       # %bb.0:
+; LE-NEXT:    addis 3, 2, .LCPI6_0 at toc@ha
+; LE-NEXT:    vspltisw 7, 12
+; LE-NEXT:    vspltisw 4, 8
+; LE-NEXT:    addi 3, 3, .LCPI6_0 at toc@l
+; LE-NEXT:    vadduwm 7, 7, 7
+; LE-NEXT:    vsrw 17, 2, 4
+; LE-NEXT:    vspltisb 5, 15
+; LE-NEXT:    vspltisw 0, 4
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    vsrw 16, 2, 7
+; LE-NEXT:    addis 3, 2, .LCPI6_1 at toc@ha
+; LE-NEXT:    vspltisw 1, 2
+; LE-NEXT:    vspltisw 6, 1
+; LE-NEXT:    vsldoi 10, 0, 0, 1
+; LE-NEXT:    addi 3, 3, .LCPI6_1 at toc@l
+; LE-NEXT:    vsldoi 13, 0, 0, 2
+; LE-NEXT:    vsldoi 9, 1, 1, 1
+; LE-NEXT:    vsldoi 12, 1, 1, 2
+; LE-NEXT:    vsldoi 14, 4, 4, 2
+; LE-NEXT:    xxland 1, 49, 0
+; LE-NEXT:    vsldoi 8, 6, 6, 1
+; LE-NEXT:    vsldoi 11, 6, 6, 2
+; LE-NEXT:    vsldoi 15, 6, 6, 3
+; LE-NEXT:    xxlor 1, 1, 48
+; LE-NEXT:    vslw 16, 2, 7
+; LE-NEXT:    xxland 34, 34, 0
+; LE-NEXT:    vslw 2, 2, 4
+; LE-NEXT:    xxlor 2, 48, 34
+; LE-NEXT:    xxlor 34, 2, 1
+; LE-NEXT:    xxland 49, 34, 37
+; LE-NEXT:    vsrw 2, 2, 0
+; LE-NEXT:    vslw 17, 17, 0
+; LE-NEXT:    xxland 1, 34, 37
+; LE-NEXT:    xxlor 34, 1, 49
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_2 at toc@ha
+; LE-NEXT:    addi 3, 3, .LCPI6_2 at toc@l
+; LE-NEXT:    xxland 50, 34, 1
+; LE-NEXT:    vsrw 2, 2, 1
+; LE-NEXT:    vslw 18, 18, 1
+; LE-NEXT:    xxland 2, 34, 1
+; LE-NEXT:    xxlor 34, 2, 50
+; LE-NEXT:    lxvd2x 2, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_3 at toc@ha
+; LE-NEXT:    vsrw 19, 2, 6
+; LE-NEXT:    addi 3, 3, .LCPI6_3 at toc@l
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_4 at toc@ha
+; LE-NEXT:    xxland 34, 34, 2
+; LE-NEXT:    xxland 3, 51, 2
+; LE-NEXT:    vsrw 19, 3, 4
+; LE-NEXT:    addi 3, 3, .LCPI6_4 at toc@l
+; LE-NEXT:    vadduwm 2, 2, 2
+; LE-NEXT:    xxlor 34, 3, 34
+; LE-NEXT:    xxland 3, 51, 0
+; LE-NEXT:    vsrw 19, 3, 7
+; LE-NEXT:    xxlor 3, 3, 51
+; LE-NEXT:    vslw 19, 3, 7
+; LE-NEXT:    xxland 35, 35, 0
+; LE-NEXT:    vslw 3, 3, 4
+; LE-NEXT:    vsldoi 16, 1, 1, 3
+; LE-NEXT:    xxlor 4, 51, 35
+; LE-NEXT:    xxlor 35, 4, 3
+; LE-NEXT:    xxland 51, 35, 37
+; LE-NEXT:    vsrw 3, 3, 0
+; LE-NEXT:    vslw 19, 19, 0
+; LE-NEXT:    xxland 3, 35, 37
+; LE-NEXT:    xxlor 35, 3, 51
+; LE-NEXT:    xxland 51, 35, 1
+; LE-NEXT:    vsrw 3, 3, 1
+; LE-NEXT:    vslw 19, 19, 1
+; LE-NEXT:    xxland 3, 35, 1
+; LE-NEXT:    xxlor 35, 3, 51
+; LE-NEXT:    vsrw 19, 3, 6
+; LE-NEXT:    xxland 35, 35, 2
+; LE-NEXT:    xxland 3, 51, 2
+; LE-NEXT:    vadduwm 3, 3, 3
+; LE-NEXT:    xxlor 3, 3, 35
+; LE-NEXT:    xxland 35, 3, 33
+; LE-NEXT:    xxland 51, 3, 38
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    vmuluwm 19, 2, 19
+; LE-NEXT:    xxlxor 4, 51, 35
+; LE-NEXT:    xxland 35, 3, 32
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 36
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    vadduwm 3, 4, 4
+; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    vsldoi 17, 0, 0, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_5 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_5 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    vslw 3, 0, 0
+; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_6 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_6 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 40
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 41
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 42
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    vslw 3, 4, 4
+; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_7 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_7 at toc@l
+; LE-NEXT:    vsldoi 18, 4, 4, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_8 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_8 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_9 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_9 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_10 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_10 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 43
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 44
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 45
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 46
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_11 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_11 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_12 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_12 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_13 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_13 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_14 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_14 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 47
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 48
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 49
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 50
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI6_15 at toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI6_15 at toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxleqv 35, 35, 35
+; LE-NEXT:    vslw 3, 3, 3
+; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    vmuluwm 2, 2, 3
+; LE-NEXT:    xxlxor 34, 4, 34
+; LE-NEXT:    vsrw 8, 2, 4
+; LE-NEXT:    vsrw 3, 2, 7
+; LE-NEXT:    xxland 3, 40, 0
+; LE-NEXT:    xxlor 3, 3, 35
+; LE-NEXT:    vslw 3, 2, 7
+; LE-NEXT:    xxland 34, 34, 0
+; LE-NEXT:    vslw 2, 2, 4
+; LE-NEXT:    xxlor 0, 35, 34
+; LE-NEXT:    xxlor 34, 0, 3
+; LE-NEXT:    xxland 35, 34, 37
+; LE-NEXT:    vsrw 2, 2, 0
+; LE-NEXT:    vslw 3, 3, 0
+; LE-NEXT:    xxland 0, 34, 37
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    xxland 35, 34, 1
+; LE-NEXT:    vsrw 2, 2, 1
+; LE-NEXT:    vslw 3, 3, 1
+; LE-NEXT:    xxland 0, 34, 1
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    vsrw 3, 2, 6
+; LE-NEXT:    xxland 34, 34, 2
+; LE-NEXT:    xxland 0, 35, 2
+; LE-NEXT:    vadduwm 2, 2, 2
+; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    blr
+  %a.ext = zext <4 x i32> %a to <4 x i64>
+  %b.ext = zext <4 x i32> %b to <4 x i64>
+  %clmul = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %a.ext, <4 x i64> %b.ext)
+  %res.ext = lshr <4 x i64> %clmul, splat (i64 31)
+  %res = trunc <4 x i64> %res.ext to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @clmulr_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; BE-LABEL: clmulr_v2i64:
+; BE:       # %bb.0:
+; BE-NEXT:    stdu 1, -1056(1)
+; BE-NEXT:    lis 7, -21846
+; BE-NEXT:    lis 8, 21845
+; BE-NEXT:    std 26, 1008(1) # 8-byte Folded Spill
+; BE-NEXT:    ori 7, 7, 43690
+; BE-NEXT:    ori 8, 8, 21845
+; BE-NEXT:    std 27, 1016(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 7, 7, 32
+; BE-NEXT:    sldi 8, 8, 32
+; BE-NEXT:    lis 9, -13108
+; BE-NEXT:    lis 10, 13107
+; BE-NEXT:    std 30, 1040(1) # 8-byte Folded Spill
+; BE-NEXT:    oris 7, 7, 43690
+; BE-NEXT:    oris 8, 8, 21845
+; BE-NEXT:    std 28, 1024(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 0, 3, 1
+; BE-NEXT:    rldicl 3, 3, 63, 1
+; BE-NEXT:    ori 9, 9, 52428
+; BE-NEXT:    ori 10, 10, 13107
+; BE-NEXT:    std 29, 1032(1) # 8-byte Folded Spill
+; BE-NEXT:    ori 27, 7, 43690
+; BE-NEXT:    ori 26, 8, 21845
+; BE-NEXT:    std 2, 904(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 9, 9, 32
+; BE-NEXT:    sldi 10, 10, 32
+; BE-NEXT:    and 7, 0, 27
+; BE-NEXT:    and 3, 3, 26
+; BE-NEXT:    std 31, 1048(1) # 8-byte Folded Spill
+; BE-NEXT:    lis 11, -3856
+; BE-NEXT:    lis 12, 3855
+; BE-NEXT:    std 15, 920(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 30, 5, 1
+; BE-NEXT:    rldicl 5, 5, 63, 1
+; BE-NEXT:    oris 9, 9, 52428
+; BE-NEXT:    oris 10, 10, 13107
+; BE-NEXT:    std 14, 912(1) # 8-byte Folded Spill
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    ori 11, 11, 61680
+; BE-NEXT:    std 17, 936(1) # 8-byte Folded Spill
+; BE-NEXT:    ori 12, 12, 3855
+; BE-NEXT:    ori 29, 9, 52428
+; BE-NEXT:    ori 28, 10, 13107
+; BE-NEXT:    and 8, 30, 27
+; BE-NEXT:    std 16, 928(1) # 8-byte Folded Spill
+; BE-NEXT:    and 5, 5, 26
+; BE-NEXT:    sldi 7, 3, 2
+; BE-NEXT:    std 19, 952(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 3, 62, 2
+; BE-NEXT:    sldi 11, 11, 32
+; BE-NEXT:    sldi 12, 12, 32
+; BE-NEXT:    or 5, 5, 8
+; BE-NEXT:    std 18, 944(1) # 8-byte Folded Spill
+; BE-NEXT:    and 7, 7, 29
+; BE-NEXT:    and 3, 3, 28
+; BE-NEXT:    std 21, 968(1) # 8-byte Folded Spill
+; BE-NEXT:    oris 11, 11, 61680
+; BE-NEXT:    oris 12, 12, 3855
+; BE-NEXT:    sldi 8, 5, 2
+; BE-NEXT:    rldicl 5, 5, 62, 2
+; BE-NEXT:    std 20, 960(1) # 8-byte Folded Spill
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    ori 9, 11, 61680
+; BE-NEXT:    std 23, 984(1) # 8-byte Folded Spill
+; BE-NEXT:    ori 10, 12, 3855
+; BE-NEXT:    and 8, 8, 29
+; BE-NEXT:    and 5, 5, 28
+; BE-NEXT:    sldi 7, 3, 4
+; BE-NEXT:    std 22, 976(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 3, 60, 4
+; BE-NEXT:    or 5, 5, 8
+; BE-NEXT:    std 25, 1000(1) # 8-byte Folded Spill
+; BE-NEXT:    and 7, 7, 9
+; BE-NEXT:    and 3, 3, 10
+; BE-NEXT:    sldi 8, 5, 4
+; BE-NEXT:    rldicl 5, 5, 60, 4
+; BE-NEXT:    std 24, 992(1) # 8-byte Folded Spill
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    and 8, 8, 9
+; BE-NEXT:    std 27, 360(1) # 8-byte Folded Spill
+; BE-NEXT:    and 5, 5, 10
+; BE-NEXT:    rotlwi 7, 3, 24
+; BE-NEXT:    or 5, 5, 8
+; BE-NEXT:    rlwimi 7, 3, 8, 8, 15
+; BE-NEXT:    std 26, 352(1) # 8-byte Folded Spill
+; BE-NEXT:    mr 30, 9
+; BE-NEXT:    std 29, 376(1) # 8-byte Folded Spill
+; BE-NEXT:    rotlwi 8, 5, 24
+; BE-NEXT:    rldicl 9, 3, 32, 32
+; BE-NEXT:    rlwimi 7, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 5, 32, 32
+; BE-NEXT:    std 28, 368(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwimi 8, 5, 8, 8, 15
+; BE-NEXT:    std 30, 384(1) # 8-byte Folded Spill
+; BE-NEXT:    rotlwi 11, 3, 24
+; BE-NEXT:    mr 0, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    std 0, 392(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwimi 11, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 8, 5, 8, 24, 31
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 11, 3, 8, 24, 31
+; BE-NEXT:    sldi 5, 8, 32
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    sldi 3, 7, 32
+; BE-NEXT:    or 11, 5, 11
+; BE-NEXT:    or 12, 3, 10
+; BE-NEXT:    rlwinm 3, 11, 0, 30, 30
+; BE-NEXT:    rlwinm 5, 11, 0, 29, 29
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 896(1) # 8-byte Folded Spill
+; BE-NEXT:    clrldi 3, 11, 63
+; BE-NEXT:    mulld 2, 12, 3
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 888(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 28, 28
+; BE-NEXT:    rlwinm 5, 11, 0, 27, 27
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 872(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 880(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 26, 26
+; BE-NEXT:    rlwinm 5, 11, 0, 25, 25
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 856(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 864(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 24, 24
+; BE-NEXT:    rlwinm 5, 11, 0, 23, 23
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 840(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 848(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 22, 22
+; BE-NEXT:    rlwinm 5, 11, 0, 21, 21
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 824(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 832(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 20, 20
+; BE-NEXT:    rlwinm 5, 11, 0, 19, 19
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 808(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 816(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 18, 18
+; BE-NEXT:    rlwinm 5, 11, 0, 17, 17
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 792(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 800(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 16, 16
+; BE-NEXT:    rlwinm 5, 11, 0, 15, 15
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 776(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 784(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 14, 14
+; BE-NEXT:    rlwinm 5, 11, 0, 13, 13
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 760(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 768(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 12, 12
+; BE-NEXT:    rlwinm 5, 11, 0, 11, 11
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 744(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 752(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 10, 10
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 736(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 9, 9
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 728(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 8, 8
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 720(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 7, 7
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 712(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 6, 6
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 704(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 5, 5
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 696(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 4, 4
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 688(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 3, 3
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 680(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 2, 2
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 672(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 1, 1
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 664(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 0, 0
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 656(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 32, 32
+; BE-NEXT:    rldicl 3, 3, 32, 31
+; BE-NEXT:    rldicr 5, 11, 0, 0
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    mulld 5, 12, 5
+; BE-NEXT:    std 3, 640(1) # 8-byte Folded Spill
+; BE-NEXT:    std 5, 648(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 31, 33
+; BE-NEXT:    rldicl 5, 11, 30, 34
+; BE-NEXT:    rldicl 3, 3, 33, 30
+; BE-NEXT:    rldicl 5, 5, 34, 29
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 624(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 632(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 29, 35
+; BE-NEXT:    rldicl 3, 3, 35, 28
+; BE-NEXT:    rldicl 5, 11, 28, 36
+; BE-NEXT:    rldicl 5, 5, 36, 27
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 608(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 616(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 27, 37
+; BE-NEXT:    rldicl 3, 3, 37, 26
+; BE-NEXT:    rldicl 5, 11, 26, 38
+; BE-NEXT:    rldicl 5, 5, 38, 25
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 592(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 600(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 25, 39
+; BE-NEXT:    rldicl 3, 3, 39, 24
+; BE-NEXT:    rldicl 5, 11, 24, 40
+; BE-NEXT:    rldicl 5, 5, 40, 23
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 576(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 584(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 23, 41
+; BE-NEXT:    rldicl 3, 3, 41, 22
+; BE-NEXT:    rldicl 5, 11, 22, 42
+; BE-NEXT:    rldicl 5, 5, 42, 21
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 560(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 568(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 21, 43
+; BE-NEXT:    rldicl 3, 3, 43, 20
+; BE-NEXT:    rldicl 5, 11, 20, 44
+; BE-NEXT:    rldicl 5, 5, 44, 19
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 544(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 552(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 19, 45
+; BE-NEXT:    rldicl 3, 3, 45, 18
+; BE-NEXT:    rldicl 5, 11, 18, 46
+; BE-NEXT:    rldicl 5, 5, 46, 17
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 528(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 536(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 17, 47
+; BE-NEXT:    rldicl 3, 3, 47, 16
+; BE-NEXT:    rldicl 5, 11, 16, 48
+; BE-NEXT:    rldicl 5, 5, 48, 15
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 512(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 520(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 15, 49
+; BE-NEXT:    rldicl 3, 3, 49, 14
+; BE-NEXT:    rldicl 5, 11, 14, 50
+; BE-NEXT:    rldicl 5, 5, 50, 13
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 496(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 504(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 13, 51
+; BE-NEXT:    rldicl 3, 3, 51, 12
+; BE-NEXT:    rldicl 5, 11, 12, 52
+; BE-NEXT:    rldicl 5, 5, 52, 11
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 480(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 488(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 11, 53
+; BE-NEXT:    rldicl 3, 3, 53, 10
+; BE-NEXT:    rldicl 5, 11, 10, 54
+; BE-NEXT:    rldicl 5, 5, 54, 9
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 464(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 472(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 9, 55
+; BE-NEXT:    rldicl 3, 3, 55, 8
+; BE-NEXT:    rldicl 5, 11, 8, 56
+; BE-NEXT:    rldicl 5, 5, 56, 7
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 448(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 456(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 7, 57
+; BE-NEXT:    rldicl 3, 3, 57, 6
+; BE-NEXT:    rldicl 5, 11, 6, 58
+; BE-NEXT:    rldicl 5, 5, 58, 5
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 432(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 440(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 5, 59
+; BE-NEXT:    rldicl 3, 3, 59, 4
+; BE-NEXT:    rldicl 5, 11, 4, 60
+; BE-NEXT:    rldicl 5, 5, 60, 3
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 416(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 424(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 3, 61
+; BE-NEXT:    rldicl 5, 11, 2, 62
+; BE-NEXT:    rldicl 3, 3, 61, 2
+; BE-NEXT:    rldicl 5, 5, 62, 1
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 400(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 408(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 3, 4, 1
+; BE-NEXT:    rldicl 4, 4, 63, 1
+; BE-NEXT:    and 3, 3, 27
+; BE-NEXT:    and 4, 4, 26
+; BE-NEXT:    or 3, 4, 3
+; BE-NEXT:    sldi 4, 3, 2
+; BE-NEXT:    rldicl 3, 3, 62, 2
+; BE-NEXT:    and 4, 4, 29
+; BE-NEXT:    and 3, 3, 28
+; BE-NEXT:    or 3, 3, 4
+; BE-NEXT:    sldi 4, 3, 4
+; BE-NEXT:    rldicl 3, 3, 60, 4
+; BE-NEXT:    and 4, 4, 30
+; BE-NEXT:    and 3, 3, 0
+; BE-NEXT:    or 3, 3, 4
+; BE-NEXT:    rotlwi 4, 3, 24
+; BE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 3, 32, 32
+; BE-NEXT:    rotlwi 5, 3, 24
+; BE-NEXT:    rlwimi 5, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 5, 3, 8, 24, 31
+; BE-NEXT:    sldi 3, 6, 1
+; BE-NEXT:    rldicl 6, 6, 63, 1
+; BE-NEXT:    and 3, 3, 27
+; BE-NEXT:    and 6, 6, 26
+; BE-NEXT:    or 3, 6, 3
+; BE-NEXT:    sldi 6, 3, 2
+; BE-NEXT:    rldicl 3, 3, 62, 2
+; BE-NEXT:    and 6, 6, 29
+; BE-NEXT:    and 3, 3, 28
+; BE-NEXT:    or 3, 3, 6
+; BE-NEXT:    sldi 6, 3, 4
+; BE-NEXT:    rldicl 3, 3, 60, 4
+; BE-NEXT:    and 6, 6, 30
+; BE-NEXT:    and 3, 3, 0
+; BE-NEXT:    or 3, 3, 6
+; BE-NEXT:    rotlwi 6, 3, 24
+; BE-NEXT:    rlwimi 6, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 6, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 3, 32, 32
+; BE-NEXT:    rotlwi 7, 3, 24
+; BE-NEXT:    rlwimi 7, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 7, 3, 8, 24, 31
+; BE-NEXT:    sldi 3, 4, 32
+; BE-NEXT:    or 4, 3, 5
+; BE-NEXT:    sldi 3, 6, 32
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    rlwinm 5, 3, 0, 30, 30
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 344(1) # 8-byte Folded Spill
+; BE-NEXT:    clrldi 5, 3, 63
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 336(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 29, 29
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 328(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 28, 28
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 320(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 27, 27
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 312(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 26, 26
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 304(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 25, 25
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 296(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 24, 24
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 288(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 23, 23
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 280(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 22, 22
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 272(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 21, 21
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 264(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 20, 20
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 256(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 19, 19
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 248(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 18, 18
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 240(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 17, 17
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 232(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 16, 16
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 224(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 15, 15
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 216(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 14, 14
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 208(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 13, 13
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 200(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 12, 12
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 192(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 11, 11
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 184(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 10, 10
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 176(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 9, 9
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 168(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 8, 8
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 160(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 7, 7
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 152(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 6, 6
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 144(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 5, 5
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 136(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 4, 4
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 128(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 3, 3
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 120(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 2, 2
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 112(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 1, 1
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 104(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 0, 0
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 96(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 5, 3, 32, 32
+; BE-NEXT:    rldicl 5, 5, 32, 31
+; BE-NEXT:    rldicr 6, 3, 0, 0
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    mulld 6, 4, 6
+; BE-NEXT:    std 5, 80(1) # 8-byte Folded Spill
+; BE-NEXT:    std 6, 88(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 5, 3, 31, 33
+; BE-NEXT:    rldicl 5, 5, 33, 30
+; BE-NEXT:    rldicl 6, 3, 30, 34
+; BE-NEXT:    rldicl 6, 6, 34, 29
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 64(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 5, 4, 6
+; BE-NEXT:    std 5, 72(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 5, 3, 29, 35
+; BE-NEXT:    rldicl 6, 3, 28, 36
+; BE-NEXT:    rldicl 5, 5, 35, 28
+; BE-NEXT:    rldicl 6, 6, 36, 27
+; BE-NEXT:    mulld 31, 4, 5
+; BE-NEXT:    mulld 5, 4, 6
+; BE-NEXT:    std 5, 56(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 5, 3, 27, 37
+; BE-NEXT:    rldicl 5, 5, 37, 26
+; BE-NEXT:    rldicl 6, 3, 26, 38
+; BE-NEXT:    mulld 15, 4, 5
+; BE-NEXT:    rldicl 5, 3, 25, 39
+; BE-NEXT:    rldicl 6, 6, 38, 25
+; BE-NEXT:    rldicl 5, 5, 39, 24
+; BE-NEXT:    mulld 14, 4, 6
+; BE-NEXT:    rldicl 6, 3, 24, 40
+; BE-NEXT:    mulld 17, 4, 5
+; BE-NEXT:    rldicl 5, 3, 23, 41
+; BE-NEXT:    rldicl 6, 6, 40, 23
+; BE-NEXT:    rldicl 5, 5, 41, 22
+; BE-NEXT:    mulld 16, 4, 6
+; BE-NEXT:    rldicl 6, 3, 22, 42
+; BE-NEXT:    mulld 19, 4, 5
+; BE-NEXT:    rldicl 5, 3, 21, 43
+; BE-NEXT:    rldicl 6, 6, 42, 21
+; BE-NEXT:    rldicl 5, 5, 43, 20
+; BE-NEXT:    mulld 18, 4, 6
+; BE-NEXT:    rldicl 6, 3, 20, 44
+; BE-NEXT:    mulld 21, 4, 5
+; BE-NEXT:    rldicl 5, 3, 19, 45
+; BE-NEXT:    rldicl 6, 6, 44, 19
+; BE-NEXT:    rldicl 5, 5, 45, 18
+; BE-NEXT:    mulld 20, 4, 6
+; BE-NEXT:    rldicl 6, 3, 18, 46
+; BE-NEXT:    mulld 23, 4, 5
+; BE-NEXT:    rldicl 5, 3, 17, 47
+; BE-NEXT:    rldicl 6, 6, 46, 17
+; BE-NEXT:    rldicl 5, 5, 47, 16
+; BE-NEXT:    mulld 22, 4, 6
+; BE-NEXT:    rldicl 6, 3, 16, 48
+; BE-NEXT:    mulld 25, 4, 5
+; BE-NEXT:    rldicl 5, 3, 15, 49
+; BE-NEXT:    rldicl 6, 6, 48, 15
+; BE-NEXT:    rldicl 5, 5, 49, 14
+; BE-NEXT:    mulld 24, 4, 6
+; BE-NEXT:    rldicl 6, 3, 14, 50
+; BE-NEXT:    mulld 27, 4, 5
+; BE-NEXT:    rldicl 5, 3, 13, 51
+; BE-NEXT:    rldicl 6, 6, 50, 13
+; BE-NEXT:    rldicl 5, 5, 51, 12
+; BE-NEXT:    mulld 26, 4, 6
+; BE-NEXT:    rldicl 6, 3, 12, 52
+; BE-NEXT:    mulld 29, 4, 5
+; BE-NEXT:    rldicl 5, 3, 11, 53
+; BE-NEXT:    rldicl 6, 6, 52, 11
+; BE-NEXT:    rldicl 5, 5, 53, 10
+; BE-NEXT:    mulld 28, 4, 6
+; BE-NEXT:    rldicl 6, 3, 10, 54
+; BE-NEXT:    mulld 0, 4, 5
+; BE-NEXT:    rldicl 5, 3, 9, 55
+; BE-NEXT:    rldicl 6, 6, 54, 9
+; BE-NEXT:    rldicl 5, 5, 55, 8
+; BE-NEXT:    mulld 30, 4, 6
+; BE-NEXT:    rldicl 6, 3, 8, 56
+; BE-NEXT:    mulld 11, 4, 5
+; BE-NEXT:    rldicl 5, 3, 7, 57
+; BE-NEXT:    rldicl 6, 6, 56, 7
+; BE-NEXT:    rldicl 5, 5, 57, 6
+; BE-NEXT:    mulld 12, 4, 6
+; BE-NEXT:    rldicl 6, 3, 6, 58
+; BE-NEXT:    mulld 9, 4, 5
+; BE-NEXT:    rldicl 5, 3, 5, 59
+; BE-NEXT:    rldicl 6, 6, 58, 5
+; BE-NEXT:    rldicl 5, 5, 59, 4
+; BE-NEXT:    mulld 10, 4, 6
+; BE-NEXT:    rldicl 6, 3, 4, 60
+; BE-NEXT:    mulld 7, 4, 5
+; BE-NEXT:    rldicl 5, 3, 3, 61
+; BE-NEXT:    rldicl 3, 3, 2, 62
+; BE-NEXT:    rldicl 6, 6, 60, 3
+; BE-NEXT:    rldicl 3, 3, 62, 1
+; BE-NEXT:    mulld 8, 4, 6
+; BE-NEXT:    mulld 6, 4, 3
+; BE-NEXT:    ld 3, 896(1) # 8-byte Folded Reload
+; BE-NEXT:    rldicl 5, 5, 61, 2
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    xor 3, 2, 3
+; BE-NEXT:    ld 4, 344(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 2, 336(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 2, 4
+; BE-NEXT:    ld 2, 888(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 328(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 872(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 320(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 880(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 312(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 856(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 304(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 864(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 296(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 840(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 288(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 848(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 280(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 824(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 272(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 832(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 264(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 808(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 256(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 816(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 248(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 792(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 240(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 800(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 232(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 776(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 224(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 784(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 216(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 760(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 208(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 768(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 200(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 744(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 192(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 752(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 184(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 736(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 176(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 728(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 168(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 720(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 160(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 712(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 152(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 704(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 144(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 696(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 136(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 688(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 128(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 680(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 120(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 672(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 112(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 664(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 104(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 656(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 96(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 640(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 80(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 624(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 64(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 632(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 72(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 608(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 31
+; BE-NEXT:    ld 31, 616(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    xor 3, 3, 31
+; BE-NEXT:    ld 31, 56(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 31
+; BE-NEXT:    ld 31, 592(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 15
+; BE-NEXT:    xor 4, 4, 14
+; BE-NEXT:    ld 15, 600(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 31
+; BE-NEXT:    xor 4, 4, 17
+; BE-NEXT:    xor 4, 4, 16
+; BE-NEXT:    xor 3, 3, 15
+; BE-NEXT:    ld 15, 576(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 19
+; BE-NEXT:    xor 4, 4, 18
+; BE-NEXT:    ld 17, 584(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 15
+; BE-NEXT:    xor 4, 4, 21
+; BE-NEXT:    xor 4, 4, 20
+; BE-NEXT:    xor 3, 3, 17
+; BE-NEXT:    ld 17, 560(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 23
+; BE-NEXT:    xor 4, 4, 22
+; BE-NEXT:    ld 19, 568(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 17
+; BE-NEXT:    xor 4, 4, 25
+; BE-NEXT:    xor 4, 4, 24
+; BE-NEXT:    xor 3, 3, 19
+; BE-NEXT:    ld 19, 544(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 27
+; BE-NEXT:    xor 4, 4, 26
+; BE-NEXT:    ld 21, 552(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 19
+; BE-NEXT:    xor 4, 4, 29
+; BE-NEXT:    xor 4, 4, 28
+; BE-NEXT:    xor 3, 3, 21
+; BE-NEXT:    ld 21, 528(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 0
+; BE-NEXT:    xor 4, 4, 30
+; BE-NEXT:    ld 23, 536(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 21
+; BE-NEXT:    xor 4, 4, 11
+; BE-NEXT:    xor 4, 4, 12
+; BE-NEXT:    xor 3, 3, 23
+; BE-NEXT:    ld 23, 512(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 9
+; BE-NEXT:    xor 4, 4, 10
+; BE-NEXT:    ld 25, 520(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 23
+; BE-NEXT:    xor 4, 4, 7
+; BE-NEXT:    xor 4, 4, 8
+; BE-NEXT:    xor 3, 3, 25
+; BE-NEXT:    ld 25, 496(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 5
+; BE-NEXT:    xor 4, 4, 6
+; BE-NEXT:    ld 27, 504(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 25
+; BE-NEXT:    xor 3, 3, 27
+; BE-NEXT:    ld 27, 480(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 29, 488(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 27
+; BE-NEXT:    xor 3, 3, 29
+; BE-NEXT:    ld 29, 464(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 0, 472(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 29
+; BE-NEXT:    xor 3, 3, 0
+; BE-NEXT:    ld 0, 448(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 11, 456(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 0
+; BE-NEXT:    xor 3, 3, 11
+; BE-NEXT:    ld 11, 432(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 9, 440(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 11
+; BE-NEXT:    xor 3, 3, 9
+; BE-NEXT:    ld 9, 416(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 7, 424(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 9
+; BE-NEXT:    xor 3, 3, 7
+; BE-NEXT:    ld 7, 400(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 5, 408(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 7
+; BE-NEXT:    xor 3, 3, 5
+; BE-NEXT:    ld 5, 648(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 5
+; BE-NEXT:    ld 5, 88(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 7, 360(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 5
+; BE-NEXT:    sldi 5, 3, 1
+; BE-NEXT:    rldicl 3, 3, 63, 1
+; BE-NEXT:    sldi 6, 4, 1
+; BE-NEXT:    ld 8, 352(1) # 8-byte Folded Reload
+; BE-NEXT:    rldicl 4, 4, 63, 1
+; BE-NEXT:    and 5, 5, 7
+; BE-NEXT:    and 3, 3, 8
+; BE-NEXT:    and 6, 6, 7
+; BE-NEXT:    ld 7, 376(1) # 8-byte Folded Reload
+; BE-NEXT:    and 4, 4, 8
+; BE-NEXT:    or 3, 3, 5
+; BE-NEXT:    ld 8, 368(1) # 8-byte Folded Reload
+; BE-NEXT:    or 4, 4, 6
+; BE-NEXT:    sldi 5, 3, 2
+; BE-NEXT:    rldicl 3, 3, 62, 2
+; BE-NEXT:    sldi 6, 4, 2
+; BE-NEXT:    ld 2, 904(1) # 8-byte Folded Reload
+; BE-NEXT:    rldicl 4, 4, 62, 2
+; BE-NEXT:    and 5, 5, 7
+; BE-NEXT:    ld 31, 1048(1) # 8-byte Folded Reload
+; BE-NEXT:    and 3, 3, 8
+; BE-NEXT:    and 6, 6, 7
+; BE-NEXT:    and 4, 4, 8
+; BE-NEXT:    ld 8, 384(1) # 8-byte Folded Reload
+; BE-NEXT:    or 3, 3, 5
+; BE-NEXT:    sldi 5, 3, 4
+; BE-NEXT:    rldicl 3, 3, 60, 4
+; BE-NEXT:    ld 7, 392(1) # 8-byte Folded Reload
+; BE-NEXT:    or 4, 4, 6
+; BE-NEXT:    and 5, 5, 8
+; BE-NEXT:    and 3, 3, 7
+; BE-NEXT:    sldi 6, 4, 4
+; BE-NEXT:    ld 30, 1040(1) # 8-byte Folded Reload
+; BE-NEXT:    rldicl 4, 4, 60, 4
+; BE-NEXT:    or 3, 3, 5
+; BE-NEXT:    ld 29, 1032(1) # 8-byte Folded Reload
+; BE-NEXT:    and 6, 6, 8
+; BE-NEXT:    and 4, 4, 7
+; BE-NEXT:    rotlwi 5, 3, 24
+; BE-NEXT:    or 4, 4, 6
+; BE-NEXT:    ld 28, 1024(1) # 8-byte Folded Reload
+; BE-NEXT:    rlwimi 5, 3, 8, 8, 15
+; BE-NEXT:    rotlwi 6, 4, 24
+; BE-NEXT:    ld 27, 1016(1) # 8-byte Folded Reload
+; BE-NEXT:    rldicl 7, 3, 32, 32
+; BE-NEXT:    rlwimi 5, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 4, 32, 32
+; BE-NEXT:    ld 26, 1008(1) # 8-byte Folded Reload
+; BE-NEXT:    rlwimi 6, 4, 8, 8, 15
+; BE-NEXT:    ld 25, 1000(1) # 8-byte Folded Reload
+; BE-NEXT:    rotlwi 8, 7, 24
+; BE-NEXT:    rotlwi 9, 3, 24
+; BE-NEXT:    rlwimi 8, 7, 8, 8, 15
+; BE-NEXT:    ld 24, 992(1) # 8-byte Folded Reload
+; BE-NEXT:    rlwimi 9, 3, 8, 8, 15
+; BE-NEXT:    ld 23, 984(1) # 8-byte Folded Reload
+; BE-NEXT:    rlwimi 6, 4, 8, 24, 31
+; BE-NEXT:    rlwimi 8, 7, 8, 24, 31
+; BE-NEXT:    ld 22, 976(1) # 8-byte Folded Reload
+; BE-NEXT:    rlwimi 9, 3, 8, 24, 31
+; BE-NEXT:    ld 21, 968(1) # 8-byte Folded Reload
+; BE-NEXT:    sldi 3, 5, 32
+; BE-NEXT:    sldi 4, 6, 32
+; BE-NEXT:    or 3, 3, 8
+; BE-NEXT:    ld 20, 960(1) # 8-byte Folded Reload
+; BE-NEXT:    or 4, 4, 9
+; BE-NEXT:    ld 19, 952(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 18, 944(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 17, 936(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 16, 928(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 15, 920(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 14, 912(1) # 8-byte Folded Reload
+; BE-NEXT:    addi 1, 1, 1056
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulr_v2i64:
+; LE:       # %bb.0:
+; LE-NEXT:    stdu 1, -752(1)
+; LE-NEXT:    lis 4, -21846
+; LE-NEXT:    lis 5, 21845
+; LE-NEXT:    xxswapd 1, 35
+; LE-NEXT:    xxswapd 0, 34
+; LE-NEXT:    mfvsrd 3, 35
+; LE-NEXT:    mfvsrd 9, 34
+; LE-NEXT:    lis 6, -13108
+; LE-NEXT:    lis 7, 13107
+; LE-NEXT:    ori 4, 4, 43690
+; LE-NEXT:    ori 5, 5, 21845
+; LE-NEXT:    mffprd 8, 1
+; LE-NEXT:    mffprd 10, 0
+; LE-NEXT:    std 28, 720(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 728(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 6, 6, 52428
+; LE-NEXT:    ori 7, 7, 13107
+; LE-NEXT:    sldi 4, 4, 32
+; LE-NEXT:    sldi 5, 5, 32
+; LE-NEXT:    sldi 6, 6, 32
+; LE-NEXT:    sldi 7, 7, 32
+; LE-NEXT:    sldi 11, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    std 30, 736(1) # 8-byte Folded Spill
+; LE-NEXT:    lis 0, -3856
+; LE-NEXT:    oris 4, 4, 43690
+; LE-NEXT:    oris 5, 5, 21845
+; LE-NEXT:    lis 30, 3855
+; LE-NEXT:    oris 6, 6, 52428
+; LE-NEXT:    sldi 12, 10, 1
+; LE-NEXT:    rldicl 10, 10, 63, 1
+; LE-NEXT:    oris 7, 7, 13107
+; LE-NEXT:    std 27, 712(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 28, 4, 43690
+; LE-NEXT:    ori 29, 5, 21845
+; LE-NEXT:    std 14, 608(1) # 8-byte Folded Spill
+; LE-NEXT:    std 15, 616(1) # 8-byte Folded Spill
+; LE-NEXT:    sldi 4, 8, 1
+; LE-NEXT:    rldicl 5, 8, 63, 1
+; LE-NEXT:    std 16, 624(1) # 8-byte Folded Spill
+; LE-NEXT:    std 17, 632(1) # 8-byte Folded Spill
+; LE-NEXT:    sldi 8, 9, 1
+; LE-NEXT:    rldicl 9, 9, 63, 1
+; LE-NEXT:    std 28, 584(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 592(1) # 8-byte Folded Spill
+; LE-NEXT:    and 11, 11, 28
+; LE-NEXT:    and 3, 3, 29
+; LE-NEXT:    std 18, 640(1) # 8-byte Folded Spill
+; LE-NEXT:    std 19, 648(1) # 8-byte Folded Spill
+; LE-NEXT:    and 4, 4, 28
+; LE-NEXT:    and 5, 5, 29
+; LE-NEXT:    std 20, 656(1) # 8-byte Folded Spill
+; LE-NEXT:    std 21, 664(1) # 8-byte Folded Spill
+; LE-NEXT:    and 8, 8, 28
+; LE-NEXT:    and 9, 9, 29
+; LE-NEXT:    std 22, 672(1) # 8-byte Folded Spill
+; LE-NEXT:    std 23, 680(1) # 8-byte Folded Spill
+; LE-NEXT:    and 12, 12, 28
+; LE-NEXT:    and 10, 10, 29
+; LE-NEXT:    std 24, 688(1) # 8-byte Folded Spill
+; LE-NEXT:    std 25, 696(1) # 8-byte Folded Spill
+; LE-NEXT:    or 3, 3, 11
+; LE-NEXT:    or 4, 5, 4
+; LE-NEXT:    std 26, 704(1) # 8-byte Folded Spill
+; LE-NEXT:    std 31, 744(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 5, 0, 61680
+; LE-NEXT:    ori 11, 30, 3855
+; LE-NEXT:    std 2, 600(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 30, 6, 52428
+; LE-NEXT:    ori 0, 7, 13107
+; LE-NEXT:    std 30, 568(1) # 8-byte Folded Spill
+; LE-NEXT:    std 0, 576(1) # 8-byte Folded Spill
+; LE-NEXT:    or 6, 9, 8
+; LE-NEXT:    or 7, 10, 12
+; LE-NEXT:    sldi 8, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    sldi 9, 4, 2
+; LE-NEXT:    rldicl 4, 4, 62, 2
+; LE-NEXT:    sldi 5, 5, 32
+; LE-NEXT:    sldi 10, 11, 32
+; LE-NEXT:    sldi 11, 6, 2
+; LE-NEXT:    rldicl 6, 6, 62, 2
+; LE-NEXT:    sldi 12, 7, 2
+; LE-NEXT:    rldicl 7, 7, 62, 2
+; LE-NEXT:    and 8, 8, 30
+; LE-NEXT:    and 3, 3, 0
+; LE-NEXT:    and 9, 9, 30
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    oris 5, 5, 61680
+; LE-NEXT:    oris 10, 10, 3855
+; LE-NEXT:    and 11, 11, 30
+; LE-NEXT:    and 6, 6, 0
+; LE-NEXT:    and 12, 12, 30
+; LE-NEXT:    and 7, 7, 0
+; LE-NEXT:    or 3, 3, 8
+; LE-NEXT:    or 4, 4, 9
+; LE-NEXT:    ori 30, 5, 61680
+; LE-NEXT:    std 30, 552(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 0, 10, 3855
+; LE-NEXT:    std 0, 560(1) # 8-byte Folded Spill
+; LE-NEXT:    or 5, 6, 11
+; LE-NEXT:    or 6, 7, 12
+; LE-NEXT:    sldi 7, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    sldi 8, 4, 4
+; LE-NEXT:    rldicl 4, 4, 60, 4
+; LE-NEXT:    sldi 9, 5, 4
+; LE-NEXT:    rldicl 5, 5, 60, 4
+; LE-NEXT:    sldi 10, 6, 4
+; LE-NEXT:    rldicl 6, 6, 60, 4
+; LE-NEXT:    and 7, 7, 30
+; LE-NEXT:    and 3, 3, 0
+; LE-NEXT:    and 8, 8, 30
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 9, 9, 30
+; LE-NEXT:    and 5, 5, 0
+; LE-NEXT:    and 10, 10, 30
+; LE-NEXT:    and 6, 6, 0
+; LE-NEXT:    or 3, 3, 7
+; LE-NEXT:    or 4, 4, 8
+; LE-NEXT:    or 5, 5, 9
+; LE-NEXT:    or 6, 6, 10
+; LE-NEXT:    rldicl 7, 3, 32, 32
+; LE-NEXT:    rotlwi 8, 3, 24
+; LE-NEXT:    rldicl 9, 4, 32, 32
+; LE-NEXT:    rotlwi 10, 4, 24
+; LE-NEXT:    rldicl 11, 5, 32, 32
+; LE-NEXT:    rotlwi 12, 5, 24
+; LE-NEXT:    rotlwi 29, 7, 24
+; LE-NEXT:    rlwimi 8, 3, 8, 8, 15
+; LE-NEXT:    rotlwi 28, 9, 24
+; LE-NEXT:    rlwimi 10, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 8, 3, 8, 24, 31
+; LE-NEXT:    rlwimi 10, 4, 8, 24, 31
+; LE-NEXT:    rotlwi 4, 11, 24
+; LE-NEXT:    rlwimi 12, 5, 8, 8, 15
+; LE-NEXT:    rlwimi 29, 7, 8, 8, 15
+; LE-NEXT:    sldi 3, 8, 32
+; LE-NEXT:    rlwimi 28, 9, 8, 8, 15
+; LE-NEXT:    sldi 8, 10, 32
+; LE-NEXT:    rlwimi 12, 5, 8, 24, 31
+; LE-NEXT:    rlwimi 29, 7, 8, 24, 31
+; LE-NEXT:    rlwimi 28, 9, 8, 24, 31
+; LE-NEXT:    rlwimi 4, 11, 8, 8, 15
+; LE-NEXT:    sldi 5, 12, 32
+; LE-NEXT:    or 9, 3, 29
+; LE-NEXT:    or 3, 8, 28
+; LE-NEXT:    rlwimi 4, 11, 8, 24, 31
+; LE-NEXT:    or 10, 5, 4
+; LE-NEXT:    rlwinm 4, 3, 0, 30, 30
+; LE-NEXT:    std 4, 544(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 5, 5
+; LE-NEXT:    std 4, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 4, 4
+; LE-NEXT:    std 4, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 3, 3
+; LE-NEXT:    std 4, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 2, 2
+; LE-NEXT:    std 4, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 1, 1
+; LE-NEXT:    std 4, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 0, 0
+; LE-NEXT:    std 4, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    std 4, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 31, 33
+; LE-NEXT:    std 4, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 30, 34
+; LE-NEXT:    std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 29, 35
+; LE-NEXT:    std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 28, 36
+; LE-NEXT:    std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 27, 37
+; LE-NEXT:    std 4, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 26, 38
+; LE-NEXT:    std 4, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 25, 39
+; LE-NEXT:    std 4, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 24, 40
+; LE-NEXT:    rldicl 0, 6, 32, 32
+; LE-NEXT:    rotlwi 30, 6, 24
+; LE-NEXT:    rotlwi 27, 0, 24
+; LE-NEXT:    std 4, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 23, 41
+; LE-NEXT:    rlwimi 30, 6, 8, 8, 15
+; LE-NEXT:    rlwimi 30, 6, 8, 24, 31
+; LE-NEXT:    rlwimi 27, 0, 8, 8, 15
+; LE-NEXT:    std 4, 216(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 22, 42
+; LE-NEXT:    sldi 6, 30, 32
+; LE-NEXT:    rlwimi 27, 0, 8, 24, 31
+; LE-NEXT:    or 11, 6, 27
+; LE-NEXT:    std 4, 208(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 21, 43
+; LE-NEXT:    clrldi 5, 3, 63
+; LE-NEXT:    rlwinm 6, 3, 0, 29, 29
+; LE-NEXT:    rlwinm 7, 3, 0, 28, 28
+; LE-NEXT:    std 4, 200(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 20, 44
+; LE-NEXT:    rlwinm 8, 3, 0, 27, 27
+; LE-NEXT:    rlwinm 12, 3, 0, 26, 26
+; LE-NEXT:    rlwinm 0, 3, 0, 25, 25
+; LE-NEXT:    std 4, 192(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 19, 45
+; LE-NEXT:    rlwinm 30, 3, 0, 24, 24
+; LE-NEXT:    rlwinm 29, 3, 0, 23, 23
+; LE-NEXT:    rlwinm 28, 3, 0, 22, 22
+; LE-NEXT:    std 4, 184(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 18, 46
+; LE-NEXT:    rlwinm 27, 3, 0, 21, 21
+; LE-NEXT:    rlwinm 26, 3, 0, 20, 20
+; LE-NEXT:    rlwinm 25, 3, 0, 19, 19
+; LE-NEXT:    std 4, 176(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 17, 47
+; LE-NEXT:    rlwinm 24, 3, 0, 18, 18
+; LE-NEXT:    rlwinm 23, 3, 0, 17, 17
+; LE-NEXT:    rlwinm 22, 3, 0, 16, 16
+; LE-NEXT:    std 4, 168(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 16, 48
+; LE-NEXT:    rlwinm 21, 3, 0, 15, 15
+; LE-NEXT:    rlwinm 20, 3, 0, 14, 14
+; LE-NEXT:    rlwinm 19, 3, 0, 13, 13
+; LE-NEXT:    std 4, 160(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 15, 49
+; LE-NEXT:    rlwinm 18, 3, 0, 12, 12
+; LE-NEXT:    rlwinm 17, 3, 0, 11, 11
+; LE-NEXT:    rlwinm 16, 3, 0, 10, 10
+; LE-NEXT:    std 4, 152(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 14, 50
+; LE-NEXT:    rlwinm 15, 3, 0, 9, 9
+; LE-NEXT:    rlwinm 14, 3, 0, 8, 8
+; LE-NEXT:    rlwinm 31, 3, 0, 7, 7
+; LE-NEXT:    std 4, 144(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 13, 51
+; LE-NEXT:    rlwinm 2, 3, 0, 6, 6
+; LE-NEXT:    std 4, 136(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 12, 52
+; LE-NEXT:    std 4, 128(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 11, 53
+; LE-NEXT:    std 4, 120(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 10, 54
+; LE-NEXT:    std 4, 112(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 9, 55
+; LE-NEXT:    std 4, 104(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 8, 56
+; LE-NEXT:    std 4, 96(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 7, 57
+; LE-NEXT:    std 4, 88(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 6, 58
+; LE-NEXT:    std 4, 80(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 5, 59
+; LE-NEXT:    std 4, 72(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 4, 60
+; LE-NEXT:    std 4, 64(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 3, 61
+; LE-NEXT:    std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 2, 62
+; LE-NEXT:    rldicr 3, 3, 0, 0
+; LE-NEXT:    std 3, 40(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    std 4, 48(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 296(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 5
+; LE-NEXT:    std 3, 288(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 6
+; LE-NEXT:    std 3, 304(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 7
+; LE-NEXT:    std 3, 312(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 8
+; LE-NEXT:    std 3, 320(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 12
+; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 0
+; LE-NEXT:    std 3, 544(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 30
+; LE-NEXT:    std 3, 536(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 29
+; LE-NEXT:    std 3, 528(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 28
+; LE-NEXT:    std 3, 520(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 27
+; LE-NEXT:    std 3, 512(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 26
+; LE-NEXT:    std 3, 504(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 25
+; LE-NEXT:    std 3, 496(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 24
+; LE-NEXT:    std 3, 488(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 23
+; LE-NEXT:    std 3, 480(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 22
+; LE-NEXT:    std 3, 472(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 21
+; LE-NEXT:    std 3, 464(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 20
+; LE-NEXT:    std 3, 456(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 19
+; LE-NEXT:    std 3, 448(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 18
+; LE-NEXT:    std 3, 440(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 17
+; LE-NEXT:    std 3, 432(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 16
+; LE-NEXT:    std 3, 424(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 15
+; LE-NEXT:    std 3, 416(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 14
+; LE-NEXT:    std 3, 408(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 31
+; LE-NEXT:    std 3, 400(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 2
+; LE-NEXT:    std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 4, 3, 32, 31
+; LE-NEXT:    ld 3, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 5, 3, 33, 30
+; LE-NEXT:    ld 3, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 6, 3, 34, 29
+; LE-NEXT:    ld 3, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 7, 3, 35, 28
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 8, 3, 36, 27
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 12, 3, 37, 26
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 0, 3, 38, 25
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 30, 3, 39, 24
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 29, 3, 40, 23
+; LE-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 28, 3, 41, 22
+; LE-NEXT:    ld 3, 208(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 27, 3, 42, 21
+; LE-NEXT:    ld 3, 200(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 26, 3, 43, 20
+; LE-NEXT:    ld 3, 192(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 25, 3, 44, 19
+; LE-NEXT:    ld 3, 184(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 24, 3, 45, 18
+; LE-NEXT:    ld 3, 176(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 23, 3, 46, 17
+; LE-NEXT:    ld 3, 168(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 22, 3, 47, 16
+; LE-NEXT:    ld 3, 160(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 21, 3, 48, 15
+; LE-NEXT:    ld 3, 152(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 20, 3, 49, 14
+; LE-NEXT:    ld 3, 144(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 19, 3, 50, 13
+; LE-NEXT:    ld 3, 136(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 18, 3, 51, 12
+; LE-NEXT:    ld 3, 128(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 17, 3, 52, 11
+; LE-NEXT:    ld 3, 120(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 16, 3, 53, 10
+; LE-NEXT:    ld 3, 112(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 15, 3, 54, 9
+; LE-NEXT:    ld 3, 104(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 14, 3, 55, 8
+; LE-NEXT:    ld 3, 96(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 31, 3, 56, 7
+; LE-NEXT:    ld 3, 88(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 2, 3, 57, 6
+; LE-NEXT:    ld 3, 80(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 58, 5
+; LE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 72(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 59, 4
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 64(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 60, 3
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 56(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 61, 2
+; LE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 48(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 62, 1
+; LE-NEXT:    std 3, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 4
+; LE-NEXT:    clrldi 4, 9, 63
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 5
+; LE-NEXT:    ld 5, 288(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 6
+; LE-NEXT:    mulld 6, 11, 7
+; LE-NEXT:    mulld 7, 11, 8
+; LE-NEXT:    mulld 8, 11, 12
+; LE-NEXT:    mulld 12, 11, 0
+; LE-NEXT:    mulld 0, 11, 30
+; LE-NEXT:    mulld 30, 11, 29
+; LE-NEXT:    mulld 29, 11, 28
+; LE-NEXT:    mulld 28, 11, 27
+; LE-NEXT:    mulld 27, 11, 26
+; LE-NEXT:    mulld 26, 11, 25
+; LE-NEXT:    mulld 25, 11, 24
+; LE-NEXT:    mulld 24, 11, 23
+; LE-NEXT:    mulld 23, 11, 22
+; LE-NEXT:    mulld 22, 11, 21
+; LE-NEXT:    mulld 21, 11, 20
+; LE-NEXT:    mulld 20, 11, 19
+; LE-NEXT:    mulld 19, 11, 18
+; LE-NEXT:    mulld 18, 11, 17
+; LE-NEXT:    mulld 17, 11, 16
+; LE-NEXT:    mulld 16, 11, 15
+; LE-NEXT:    mulld 15, 11, 14
+; LE-NEXT:    mulld 14, 11, 31
+; LE-NEXT:    mulld 31, 11, 2
+; LE-NEXT:    std 3, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 2, 11, 3
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 11, 11, 3
+; LE-NEXT:    rlwinm 3, 9, 0, 30, 30
+; LE-NEXT:    mulld 3, 10, 3
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 296(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 5, 4
+; LE-NEXT:    rlwinm 5, 9, 0, 29, 29
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 304(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 28, 28
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 312(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 27, 27
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 320(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 26, 26
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 25, 25
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 536(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 528(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 520(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 512(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 504(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 496(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 488(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 480(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 472(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 464(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 456(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 448(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 440(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 432(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 424(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 416(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 408(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 400(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 392(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 6
+; LE-NEXT:    ld 6, 592(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 7
+; LE-NEXT:    ld 7, 584(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 8
+; LE-NEXT:    ld 8, 576(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 12
+; LE-NEXT:    ld 12, 560(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 0
+; LE-NEXT:    ld 0, 552(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 30
+; LE-NEXT:    ld 30, 736(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 29
+; LE-NEXT:    ld 29, 728(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 28
+; LE-NEXT:    ld 28, 720(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 27
+; LE-NEXT:    ld 27, 712(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 26
+; LE-NEXT:    ld 26, 704(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 25
+; LE-NEXT:    ld 25, 696(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 24
+; LE-NEXT:    ld 24, 688(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 23
+; LE-NEXT:    ld 23, 680(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 22
+; LE-NEXT:    ld 22, 672(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 21
+; LE-NEXT:    ld 21, 664(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 20
+; LE-NEXT:    ld 20, 656(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 19
+; LE-NEXT:    ld 19, 648(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 18
+; LE-NEXT:    ld 18, 640(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 17
+; LE-NEXT:    ld 17, 632(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 16
+; LE-NEXT:    ld 16, 624(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 15
+; LE-NEXT:    ld 15, 616(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 14
+; LE-NEXT:    ld 14, 608(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 31
+; LE-NEXT:    ld 31, 744(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 2
+; LE-NEXT:    ld 2, 600(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 11
+; LE-NEXT:    ld 11, 568(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rotlwi 5, 3, 24
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    rlwimi 5, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 5, 3, 8, 24, 31
+; LE-NEXT:    rotlwi 3, 4, 24
+; LE-NEXT:    rlwimi 3, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 3, 4, 8, 24, 31
+; LE-NEXT:    sldi 4, 5, 32
+; LE-NEXT:    or 3, 4, 3
+; LE-NEXT:    ld 4, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    mtfprd 0, 3
+; LE-NEXT:    rlwinm 3, 9, 0, 24, 24
+; LE-NEXT:    mulld 3, 10, 3
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    rlwinm 4, 9, 0, 23, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 22, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 21, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 20, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 19, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 18, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 17, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 16, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 15, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 14, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 13, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 12, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 11, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 10, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 9, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 8, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 7, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 6, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 5, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 4, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 3, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 2, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 1, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 32, 32
+; LE-NEXT:    rldicl 4, 4, 32, 31
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 31, 33
+; LE-NEXT:    rldicl 4, 4, 33, 30
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 30, 34
+; LE-NEXT:    rldicl 4, 4, 34, 29
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 29, 35
+; LE-NEXT:    rldicl 4, 4, 35, 28
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 28, 36
+; LE-NEXT:    rldicl 4, 4, 36, 27
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 27, 37
+; LE-NEXT:    rldicl 4, 4, 37, 26
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 26, 38
+; LE-NEXT:    rldicl 4, 4, 38, 25
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 25, 39
+; LE-NEXT:    rldicl 4, 4, 39, 24
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 24, 40
+; LE-NEXT:    rldicl 4, 4, 40, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 23, 41
+; LE-NEXT:    rldicl 4, 4, 41, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 22, 42
+; LE-NEXT:    rldicl 4, 4, 42, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 21, 43
+; LE-NEXT:    rldicl 4, 4, 43, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 20, 44
+; LE-NEXT:    rldicl 4, 4, 44, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 19, 45
+; LE-NEXT:    rldicl 4, 4, 45, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 18, 46
+; LE-NEXT:    rldicl 4, 4, 46, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 17, 47
+; LE-NEXT:    rldicl 4, 4, 47, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 16, 48
+; LE-NEXT:    rldicl 4, 4, 48, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 15, 49
+; LE-NEXT:    rldicl 4, 4, 49, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 14, 50
+; LE-NEXT:    rldicl 4, 4, 50, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 13, 51
+; LE-NEXT:    rldicl 4, 4, 51, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 12, 52
+; LE-NEXT:    rldicl 4, 4, 52, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 11, 53
+; LE-NEXT:    rldicl 4, 4, 53, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 10, 54
+; LE-NEXT:    rldicl 4, 4, 54, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 9, 55
+; LE-NEXT:    rldicl 4, 4, 55, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 8, 56
+; LE-NEXT:    rldicl 4, 4, 56, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 7, 57
+; LE-NEXT:    rldicl 4, 4, 57, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 6, 58
+; LE-NEXT:    rldicl 4, 4, 58, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 5, 59
+; LE-NEXT:    rldicl 4, 4, 59, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 4, 60
+; LE-NEXT:    rldicl 4, 4, 60, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 3, 61
+; LE-NEXT:    rldicl 4, 4, 61, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 2, 62
+; LE-NEXT:    rldicl 4, 4, 62, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicr 4, 9, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    rotlwi 5, 4, 24
+; LE-NEXT:    rlwimi 5, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 5, 4, 8, 24, 31
+; LE-NEXT:    rotlwi 4, 3, 24
+; LE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; LE-NEXT:    sldi 3, 4, 32
+; LE-NEXT:    or 3, 3, 5
+; LE-NEXT:    mtfprd 1, 3
+; LE-NEXT:    xxmrghd 34, 1, 0
+; LE-NEXT:    addi 1, 1, 752
+; LE-NEXT:    blr
+  %a.ext = zext <2 x i64> %a to <2 x i128>
+  %b.ext = zext <2 x i64> %b to <2 x i128>
+  %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+  %res.ext = lshr <2 x i128> %clmul, splat (i128 63)
+  %res = trunc <2 x i128> %res.ext to <2 x i64>
+  ret <2 x i64> %res
+}
+
+; High-half carry-less multiply on <16 x i8>. Both operands are zero-extended
+; to <16 x i16>, multiplied with llvm.clmul.v16i16, and bits 15..8 of every
+; lane are returned (lshr by 8, then trunc back to i8).
+define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; BE-LABEL: clmulh_v16i8:
+; BE:       # %bb.0:
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vspltisb 4, 4
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vsrb 1, 3, 4
+; BE-NEXT:    vspltisb 5, 15
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vspltisb 7, -1
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI8_0@toc@l
+; BE-NEXT:    vand 3, 3, 5
+; BE-NEXT:    vspltisb 13, 8
+; BE-NEXT:    vslb 3, 3, 4
+; BE-NEXT:    vsrb 0, 2, 4
+; BE-NEXT:    vand 2, 2, 5
+; BE-NEXT:    vor 1, 1, 3
+; BE-NEXT:    lvx 3, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI8_1@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI8_1@toc@l
+; BE-NEXT:    vslb 2, 2, 4
+; BE-NEXT:    vor 0, 0, 2
+; BE-NEXT:    vspltisb 2, 2
+; BE-NEXT:    vsrb 9, 1, 2
+; BE-NEXT:    vand 1, 1, 3
+; BE-NEXT:    vand 9, 9, 3
+; BE-NEXT:    vslb 1, 1, 2
+; BE-NEXT:    vsrb 8, 0, 2
+; BE-NEXT:    vand 0, 0, 3
+; BE-NEXT:    vor 9, 9, 1
+; BE-NEXT:    lvx 1, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI8_3@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI8_3@toc@l
+; BE-NEXT:    lvx 15, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI8_2@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI8_2@toc@l
+; BE-NEXT:    vand 8, 8, 3
+; BE-NEXT:    vslb 0, 0, 2
+; BE-NEXT:    vor 8, 8, 0
+; BE-NEXT:    vspltisb 0, 1
+; BE-NEXT:    vsrb 11, 9, 0
+; BE-NEXT:    vand 9, 9, 1
+; BE-NEXT:    vaddubm 9, 9, 9
+; BE-NEXT:    vand 11, 11, 1
+; BE-NEXT:    vsrb 10, 8, 0
+; BE-NEXT:    vand 8, 8, 1
+; BE-NEXT:    vaddubm 8, 8, 8
+; BE-NEXT:    vor 9, 11, 9
+; BE-NEXT:    vslb 6, 4, 4
+; BE-NEXT:    vslb 7, 7, 7
+; BE-NEXT:    vand 10, 10, 1
+; BE-NEXT:    vand 14, 9, 13
+; BE-NEXT:    vaddubm 13, 13, 13
+; BE-NEXT:    vor 8, 10, 8
+; BE-NEXT:    vand 10, 9, 2
+; BE-NEXT:    vand 11, 9, 0
+; BE-NEXT:    vand 12, 9, 4
+; BE-NEXT:    vand 13, 9, 13
+; BE-NEXT:    vand 15, 9, 15
+; BE-NEXT:    vand 6, 9, 6
+; BE-NEXT:    vand 7, 9, 7
+; BE-NEXT:    vmuloub 9, 8, 10
+; BE-NEXT:    vmuleub 10, 8, 10
+; BE-NEXT:    vmuloub 16, 8, 11
+; BE-NEXT:    vmuleub 11, 8, 11
+; BE-NEXT:    vmuloub 17, 8, 12
+; BE-NEXT:    vmuleub 12, 8, 12
+; BE-NEXT:    vmuloub 18, 8, 14
+; BE-NEXT:    vmuleub 14, 8, 14
+; BE-NEXT:    vmuloub 19, 8, 13
+; BE-NEXT:    vmuleub 13, 8, 13
+; BE-NEXT:    vmuloub 31, 8, 15
+; BE-NEXT:    vmuleub 15, 8, 15
+; BE-NEXT:    vmuloub 30, 8, 6
+; BE-NEXT:    vmuleub 6, 8, 6
+; BE-NEXT:    vmuloub 29, 8, 7
+; BE-NEXT:    vmuleub 7, 8, 7
+; BE-NEXT:    lvx 8, 0, 3
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vperm 9, 10, 9, 8
+; BE-NEXT:    vperm 10, 11, 16, 8
+; BE-NEXT:    vperm 11, 12, 17, 8
+; BE-NEXT:    vperm 12, 14, 18, 8
+; BE-NEXT:    vperm 13, 13, 19, 8
+; BE-NEXT:    vperm 14, 15, 31, 8
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vperm 6, 6, 30, 8
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vperm 7, 7, 29, 8
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    vxor 8, 10, 9
+; BE-NEXT:    vxor 8, 8, 11
+; BE-NEXT:    vxor 8, 8, 12
+; BE-NEXT:    vxor 8, 8, 13
+; BE-NEXT:    vxor 8, 8, 14
+; BE-NEXT:    vxor 6, 8, 6
+; BE-NEXT:    vxor 6, 6, 7
+; BE-NEXT:    vand 5, 6, 5
+; BE-NEXT:    vsrb 7, 6, 4
+; BE-NEXT:    vslb 4, 5, 4
+; BE-NEXT:    vor 4, 7, 4
+; BE-NEXT:    vand 5, 4, 3
+; BE-NEXT:    vsrb 4, 4, 2
+; BE-NEXT:    vslb 2, 5, 2
+; BE-NEXT:    vand 3, 4, 3
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    vsrb 3, 2, 0
+; BE-NEXT:    vand 2, 2, 1
+; BE-NEXT:    vaddubm 2, 2, 2
+; BE-NEXT:    vand 3, 3, 1
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    vsrb 2, 2, 0
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulh_v16i8:
+; LE:       # %bb.0:
+; LE-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
+; LE-NEXT:    vspltisb 4, 4
+; LE-NEXT:    vspltisb 5, 2
+; LE-NEXT:    addi 3, 3, .LCPI8_0@toc@l
+; LE-NEXT:    vslb 1, 3, 4
+; LE-NEXT:    vsrb 3, 3, 4
+; LE-NEXT:    vslb 6, 2, 4
+; LE-NEXT:    vsrb 2, 2, 4
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI8_1@toc@ha
+; LE-NEXT:    xxlor 35, 35, 33
+; LE-NEXT:    xxlor 34, 34, 38
+; LE-NEXT:    vspltisb 0, 1
+; LE-NEXT:    addi 3, 3, .LCPI8_1@toc@l
+; LE-NEXT:    vsrb 1, 3, 5
+; LE-NEXT:    vsrb 7, 2, 5
+; LE-NEXT:    vspltisb 6, 8
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI8_2@toc@ha
+; LE-NEXT:    xxland 35, 35, 0
+; LE-NEXT:    xxland 34, 34, 0
+; LE-NEXT:    xxland 2, 33, 0
+; LE-NEXT:    xxland 3, 39, 0
+; LE-NEXT:    addi 3, 3, .LCPI8_2@toc@l
+; LE-NEXT:    vslb 3, 3, 5
+; LE-NEXT:    vslb 2, 2, 5
+; LE-NEXT:    xxlor 35, 2, 35
+; LE-NEXT:    xxlor 34, 3, 34
+; LE-NEXT:    lxvd2x 3, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI8_3@toc@ha
+; LE-NEXT:    vsrb 1, 3, 0
+; LE-NEXT:    xxland 35, 35, 1
+; LE-NEXT:    vsrb 7, 2, 0
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    addi 3, 3, .LCPI8_3@toc@l
+; LE-NEXT:    xxland 2, 33, 1
+; LE-NEXT:    vaddubm 3, 3, 3
+; LE-NEXT:    vaddubm 2, 2, 2
+; LE-NEXT:    xxlor 2, 2, 35
+; LE-NEXT:    xxland 35, 2, 37
+; LE-NEXT:    xxswapd 33, 3
+; LE-NEXT:    xxland 3, 39, 1
+; LE-NEXT:    xxlor 34, 3, 34
+; LE-NEXT:    lxvd2x 3, 0, 3
+; LE-NEXT:    vmuloub 7, 2, 3
+; LE-NEXT:    vmuleub 3, 2, 3
+; LE-NEXT:    vperm 3, 3, 7, 1
+; LE-NEXT:    xxland 39, 2, 32
+; LE-NEXT:    vmuloub 8, 2, 7
+; LE-NEXT:    vmuleub 7, 2, 7
+; LE-NEXT:    vperm 7, 7, 8, 1
+; LE-NEXT:    xxland 40, 2, 36
+; LE-NEXT:    vmuloub 9, 2, 8
+; LE-NEXT:    vmuleub 8, 2, 8
+; LE-NEXT:    vperm 8, 8, 9, 1
+; LE-NEXT:    xxland 41, 2, 38
+; LE-NEXT:    vaddubm 6, 6, 6
+; LE-NEXT:    vmuloub 10, 2, 9
+; LE-NEXT:    vmuleub 9, 2, 9
+; LE-NEXT:    xxland 38, 2, 38
+; LE-NEXT:    vperm 9, 9, 10, 1
+; LE-NEXT:    vmuloub 10, 2, 6
+; LE-NEXT:    vmuleub 6, 2, 6
+; LE-NEXT:    vperm 6, 6, 10, 1
+; LE-NEXT:    xxland 42, 2, 3
+; LE-NEXT:    vmuloub 11, 2, 10
+; LE-NEXT:    vmuleub 10, 2, 10
+; LE-NEXT:    vperm 10, 10, 11, 1
+; LE-NEXT:    vslb 11, 4, 4
+; LE-NEXT:    xxland 43, 2, 43
+; LE-NEXT:    vmuloub 12, 2, 11
+; LE-NEXT:    vmuleub 11, 2, 11
+; LE-NEXT:    vperm 11, 11, 12, 1
+; LE-NEXT:    xxleqv 44, 44, 44
+; LE-NEXT:    vslb 12, 12, 12
+; LE-NEXT:    xxland 44, 2, 44
+; LE-NEXT:    xxlxor 2, 39, 35
+; LE-NEXT:    xxlxor 2, 2, 40
+; LE-NEXT:    vmuloub 13, 2, 12
+; LE-NEXT:    vmuleub 2, 2, 12
+; LE-NEXT:    xxlxor 2, 2, 41
+; LE-NEXT:    xxlxor 2, 2, 38
+; LE-NEXT:    xxlxor 2, 2, 42
+; LE-NEXT:    xxlxor 2, 2, 43
+; LE-NEXT:    vperm 2, 2, 13, 1
+; LE-NEXT:    xxlxor 34, 2, 34
+; LE-NEXT:    vslb 3, 2, 4
+; LE-NEXT:    vsrb 2, 2, 4
+; LE-NEXT:    xxlor 34, 34, 35
+; LE-NEXT:    xxland 35, 34, 0
+; LE-NEXT:    vsrb 2, 2, 5
+; LE-NEXT:    vslb 3, 3, 5
+; LE-NEXT:    xxland 0, 34, 0
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    vsrb 3, 2, 0
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    xxland 0, 35, 1
+; LE-NEXT:    vaddubm 2, 2, 2
+; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    vsrb 2, 2, 0
+; LE-NEXT:    blr
+  %a.ext = zext <16 x i8> %a to <16 x i16>
+  %b.ext = zext <16 x i8> %b to <16 x i16>
+  %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+  %res.ext = lshr <16 x i16> %clmul, splat (i16 8)
+  %res = trunc <16 x i16> %res.ext to <16 x i8>
+  ret <16 x i8> %res
+}
+
+; clmulh on <8 x i16>: keeps the upper 16 bits of each lane's 32-bit carry-less
+; product, expressed as zext to i32 -> llvm.clmul.v8i32 -> lshr 16 -> trunc.
+define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; BE-LABEL: clmulh_v8i16:
+; BE:       # %bb.0:
+; BE-NEXT:    li 3, -80
+; BE-NEXT:    vspltish 4, 8
+; BE-NEXT:    vxor 5, 5, 5
+; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -64
+; BE-NEXT:    vadduhm 19, 4, 4
+; BE-NEXT:    vspltisb 1, -1
+; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vspltish 0, 2
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vrlh 8, 2, 4
+; BE-NEXT:    vspltish 2, 4
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_0@toc@l
+; BE-NEXT:    vrlh 6, 3, 4
+; BE-NEXT:    vspltish 3, 1
+; BE-NEXT:    vslh 13, 1, 1
+; BE-NEXT:    vspltisb 1, 15
+; BE-NEXT:    vand 14, 8, 1
+; BE-NEXT:    vsrh 8, 8, 2
+; BE-NEXT:    vand 15, 6, 1
+; BE-NEXT:    vsrh 6, 6, 2
+; BE-NEXT:    vslh 14, 14, 2
+; BE-NEXT:    vand 8, 8, 1
+; BE-NEXT:    vslh 15, 15, 2
+; BE-NEXT:    vand 6, 6, 1
+; BE-NEXT:    vor 8, 8, 14
+; BE-NEXT:    vor 14, 6, 15
+; BE-NEXT:    lvx 6, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_1@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_1@toc@l
+; BE-NEXT:    vand 15, 8, 6
+; BE-NEXT:    vsrh 8, 8, 0
+; BE-NEXT:    vslh 15, 15, 0
+; BE-NEXT:    vand 8, 8, 6
+; BE-NEXT:    vor 15, 8, 15
+; BE-NEXT:    lvx 8, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_2@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_2@toc@l
+; BE-NEXT:    lvx 31, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_3@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_3@toc@l
+; BE-NEXT:    lvx 30, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_4@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_4@toc@l
+; BE-NEXT:    vand 16, 14, 6
+; BE-NEXT:    lvx 29, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_5@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_5@toc@l
+; BE-NEXT:    lvx 28, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI9_6@toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI9_6@toc@l
+; BE-NEXT:    lvx 27, 0, 3
+; BE-NEXT:    li 3, -16
+; BE-NEXT:    vsrh 14, 14, 0
+; BE-NEXT:    vslh 16, 16, 0
+; BE-NEXT:    vand 14, 14, 6
+; BE-NEXT:    vor 14, 14, 16
+; BE-NEXT:    vsrh 17, 14, 3
+; BE-NEXT:    vand 14, 14, 8
+; BE-NEXT:    vadduhm 14, 14, 14
+; BE-NEXT:    vsrh 16, 15, 3
+; BE-NEXT:    vand 15, 15, 8
+; BE-NEXT:    vadduhm 15, 15, 15
+; BE-NEXT:    vand 17, 17, 8
+; BE-NEXT:    vand 16, 16, 8
+; BE-NEXT:    vor 14, 17, 14
+; BE-NEXT:    vslh 7, 2, 2
+; BE-NEXT:    vsldoi 9, 3, 3, 1
+; BE-NEXT:    vsldoi 10, 0, 0, 1
+; BE-NEXT:    vsldoi 11, 2, 2, 1
+; BE-NEXT:    vslh 12, 4, 4
+; BE-NEXT:    vor 15, 16, 15
+; BE-NEXT:    vand 16, 14, 0
+; BE-NEXT:    vand 17, 14, 3
+; BE-NEXT:    vand 18, 14, 2
+; BE-NEXT:    vand 19, 14, 19
+; BE-NEXT:    vand 31, 14, 31
+; BE-NEXT:    vand 7, 14, 7
+; BE-NEXT:    vand 30, 14, 30
+; BE-NEXT:    vand 9, 14, 9
+; BE-NEXT:    vand 10, 14, 10
+; BE-NEXT:    vand 11, 14, 11
+; BE-NEXT:    vand 12, 14, 12
+; BE-NEXT:    vand 29, 14, 29
+; BE-NEXT:    vand 28, 14, 28
+; BE-NEXT:    vand 27, 14, 27
+; BE-NEXT:    vand 13, 14, 13
+; BE-NEXT:    vand 14, 14, 4
+; BE-NEXT:    vmladduhm 16, 15, 16, 5
+; BE-NEXT:    vmladduhm 17, 15, 17, 5
+; BE-NEXT:    vmladduhm 18, 15, 18, 5
+; BE-NEXT:    vmladduhm 14, 15, 14, 5
+; BE-NEXT:    vmladduhm 19, 15, 19, 5
+; BE-NEXT:    vmladduhm 31, 15, 31, 5
+; BE-NEXT:    vmladduhm 7, 15, 7, 5
+; BE-NEXT:    vmladduhm 30, 15, 30, 5
+; BE-NEXT:    vmladduhm 9, 15, 9, 5
+; BE-NEXT:    vmladduhm 10, 15, 10, 5
+; BE-NEXT:    vmladduhm 11, 15, 11, 5
+; BE-NEXT:    vmladduhm 12, 15, 12, 5
+; BE-NEXT:    vmladduhm 29, 15, 29, 5
+; BE-NEXT:    vmladduhm 28, 15, 28, 5
+; BE-NEXT:    vmladduhm 27, 15, 27, 5
+; BE-NEXT:    vmladduhm 5, 15, 13, 5
+; BE-NEXT:    vxor 13, 17, 16
+; BE-NEXT:    vxor 13, 13, 18
+; BE-NEXT:    vxor 13, 13, 14
+; BE-NEXT:    vxor 13, 13, 19
+; BE-NEXT:    vxor 13, 13, 31
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -32
+; BE-NEXT:    vxor 7, 13, 7
+; BE-NEXT:    vxor 7, 7, 30
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -48
+; BE-NEXT:    vxor 7, 7, 9
+; BE-NEXT:    vxor 7, 7, 10
+; BE-NEXT:    vxor 7, 7, 11
+; BE-NEXT:    vxor 7, 7, 12
+; BE-NEXT:    vxor 7, 7, 29
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -64
+; BE-NEXT:    vxor 7, 7, 28
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, -80
+; BE-NEXT:    vxor 7, 7, 27
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    vxor 5, 7, 5
+; BE-NEXT:    vrlh 4, 5, 4
+; BE-NEXT:    vand 5, 4, 1
+; BE-NEXT:    vsrh 4, 4, 2
+; BE-NEXT:    vslh 2, 5, 2
+; BE-NEXT:    vand 4, 4, 1
+; BE-NEXT:    vor 2, 4, 2
+; BE-NEXT:    vand 4, 2, 6
+; BE-NEXT:    vsrh 2, 2, 0
+; BE-NEXT:    vslh 4, 4, 0
+; BE-NEXT:    vand 2, 2, 6
+; BE-NEXT:    vor 2, 2, 4
+; BE-NEXT:    vsrh 4, 2, 3
+; BE-NEXT:    vand 2, 2, 8
+; BE-NEXT:    vadduhm 2, 2, 2
+; BE-NEXT:    vand 4, 4, 8
+; BE-NEXT:    vor 2, 4, 2
+; BE-NEXT:    vsrh 2, 2, 3
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulh_v8i16:
+; LE:       # %bb.0:
+; LE-NEXT:    vspltish 5, 8
+; LE-NEXT:    vspltisb 4, 15
+; LE-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
+; LE-NEXT:    vrlh 2, 2, 5
+; LE-NEXT:    vspltish 0, 4
+; LE-NEXT:    addi 3, 3, .LCPI9_0@toc@l
+; LE-NEXT:    vspltish 6, 2
+; LE-NEXT:    vspltish 1, 1
+; LE-NEXT:    vrlh 3, 3, 5
+; LE-NEXT:    xxland 42, 34, 36
+; LE-NEXT:    vsrh 2, 2, 0
+; LE-NEXT:    vslh 10, 10, 0
+; LE-NEXT:    xxland 0, 34, 36
+; LE-NEXT:    vsldoi 7, 1, 1, 1
+; LE-NEXT:    vsldoi 8, 6, 6, 1
+; LE-NEXT:    vsldoi 9, 0, 0, 1
+; LE-NEXT:    xxlor 34, 0, 42
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_1@toc@ha
+; LE-NEXT:    addi 3, 3, .LCPI9_1@toc@l
+; LE-NEXT:    xxland 42, 34, 0
+; LE-NEXT:    vsrh 2, 2, 6
+; LE-NEXT:    vslh 10, 10, 6
+; LE-NEXT:    xxland 1, 34, 0
+; LE-NEXT:    xxlor 34, 1, 42
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_2@toc@ha
+; LE-NEXT:    vsrh 10, 2, 1
+; LE-NEXT:    addi 3, 3, .LCPI9_2@toc@l
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_3@toc@ha
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    xxland 2, 42, 1
+; LE-NEXT:    xxland 42, 35, 36
+; LE-NEXT:    vsrh 3, 3, 0
+; LE-NEXT:    addi 3, 3, .LCPI9_3@toc@l
+; LE-NEXT:    vadduhm 2, 2, 2
+; LE-NEXT:    vslh 10, 10, 0
+; LE-NEXT:    xxlor 34, 2, 34
+; LE-NEXT:    xxland 2, 35, 36
+; LE-NEXT:    xxlor 35, 2, 42
+; LE-NEXT:    xxland 42, 35, 0
+; LE-NEXT:    vsrh 3, 3, 6
+; LE-NEXT:    vslh 10, 10, 6
+; LE-NEXT:    xxland 2, 35, 0
+; LE-NEXT:    xxlor 35, 2, 42
+; LE-NEXT:    vsrh 10, 3, 1
+; LE-NEXT:    xxland 35, 35, 1
+; LE-NEXT:    xxland 2, 42, 1
+; LE-NEXT:    vadduhm 3, 3, 3
+; LE-NEXT:    xxlor 2, 2, 35
+; LE-NEXT:    vxor 3, 3, 3
+; LE-NEXT:    xxland 42, 2, 38
+; LE-NEXT:    xxland 43, 2, 33
+; LE-NEXT:    xxland 39, 2, 39
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    vmladduhm 11, 2, 11, 3
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 43, 42
+; LE-NEXT:    xxland 42, 2, 32
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    xxland 42, 2, 37
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    vadduhm 10, 5, 5
+; LE-NEXT:    xxland 42, 2, 42
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    xxland 42, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_4@toc@ha
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    addi 3, 3, .LCPI9_4@toc@l
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    vslh 10, 0, 0
+; LE-NEXT:    xxland 42, 2, 42
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    xxland 42, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_5@toc@ha
+; LE-NEXT:    vmladduhm 10, 2, 10, 3
+; LE-NEXT:    addi 3, 3, .LCPI9_5@toc@l
+; LE-NEXT:    xxlxor 3, 3, 42
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 40
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 41
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    vslh 7, 5, 5
+; LE-NEXT:    xxland 39, 2, 39
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI9_6@toc@ha
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    addi 3, 3, .LCPI9_6@toc@l
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 4
+; LE-NEXT:    lxvd2x 4, 0, 3
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxland 39, 2, 4
+; LE-NEXT:    vmladduhm 7, 2, 7, 3
+; LE-NEXT:    xxlxor 3, 3, 39
+; LE-NEXT:    xxleqv 39, 39, 39
+; LE-NEXT:    vslh 7, 7, 7
+; LE-NEXT:    xxland 39, 2, 39
+; LE-NEXT:    vmladduhm 2, 2, 7, 3
+; LE-NEXT:    xxlxor 34, 3, 34
+; LE-NEXT:    vrlh 2, 2, 5
+; LE-NEXT:    xxland 35, 34, 36
+; LE-NEXT:    vsrh 2, 2, 0
+; LE-NEXT:    vslh 3, 3, 0
+; LE-NEXT:    xxland 2, 34, 36
+; LE-NEXT:    xxlor 34, 2, 35
+; LE-NEXT:    xxland 35, 34, 0
+; LE-NEXT:    vsrh 2, 2, 6
+; LE-NEXT:    vslh 3, 3, 6
+; LE-NEXT:    xxland 0, 34, 0
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    vsrh 3, 2, 1
+; LE-NEXT:    xxland 34, 34, 1
+; LE-NEXT:    xxland 0, 35, 1
+; LE-NEXT:    vadduhm 2, 2, 2
+; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    vsrh 2, 2, 1
+; LE-NEXT:    blr
+  %a.ext = zext <8 x i16> %a to <8 x i32>
+  %b.ext = zext <8 x i16> %b to <8 x i32>
+  %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext)
+  %res.ext = lshr <8 x i32> %clmul, splat (i32 16)
+  %res = trunc <8 x i32> %res.ext to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; BE-LABEL: clmulh_v4i32:
+; BE:       # %bb.0:
+; BE-NEXT:    stdu 1, -1472(1)
+; BE-NEXT:    li 3, 1280
+; BE-NEXT:    vspltisb 12, -1
+; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1296
+; BE-NEXT:    vslw 15, 12, 12
+; BE-NEXT:    vspltisw 12, 12
+; BE-NEXT:    stvx 21, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1312
+; BE-NEXT:    vadduwm 17, 12, 12
+; BE-NEXT:    vspltisw 18, 8
+; BE-NEXT:    stvx 22, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1328
+; BE-NEXT:    vsrw 6, 2, 18
+; BE-NEXT:    vspltisw 19, 4
+; BE-NEXT:    stvx 23, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1344
+; BE-NEXT:    stvx 24, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1360
+; BE-NEXT:    stvx 25, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1376
+; BE-NEXT:    vsrw 9, 3, 18
+; BE-NEXT:    stvx 26, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1392
+; BE-NEXT:    stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1408
+; BE-NEXT:    stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1424
+; BE-NEXT:    vsrw 12, 2, 17
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1440
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1456
+; BE-NEXT:    vspltisw 30, 2
+; BE-NEXT:    vslw 14, 2, 17
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1264
+; BE-NEXT:    vspltisw 31, 1
+; BE-NEXT:    stvx 17, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI10_0 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_0 at toc@l
+; BE-NEXT:    lvx 29, 0, 3
+; BE-NEXT:    li 3, 1248
+; BE-NEXT:    vsrw 16, 3, 17
+; BE-NEXT:    stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1232
+; BE-NEXT:    vslw 17, 3, 17
+; BE-NEXT:    vand 2, 2, 29
+; BE-NEXT:    vand 3, 3, 29
+; BE-NEXT:    vand 6, 6, 29
+; BE-NEXT:    vand 9, 9, 29
+; BE-NEXT:    vslw 2, 2, 18
+; BE-NEXT:    vslw 3, 3, 18
+; BE-NEXT:    vor 6, 6, 12
+; BE-NEXT:    vspltisb 12, 15
+; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI10_1 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_1 at toc@l
+; BE-NEXT:    vor 9, 9, 16
+; BE-NEXT:    vor 2, 14, 2
+; BE-NEXT:    vor 3, 17, 3
+; BE-NEXT:    vor 2, 2, 6
+; BE-NEXT:    vor 3, 3, 9
+; BE-NEXT:    vand 6, 2, 12
+; BE-NEXT:    vsrw 2, 2, 19
+; BE-NEXT:    vand 9, 3, 12
+; BE-NEXT:    vsrw 3, 3, 19
+; BE-NEXT:    vand 2, 2, 12
+; BE-NEXT:    vand 3, 3, 12
+; BE-NEXT:    lvx 12, 0, 3
+; BE-NEXT:    li 3, 1216
+; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI10_2 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_2 at toc@l
+; BE-NEXT:    vslw 6, 6, 19
+; BE-NEXT:    vslw 9, 9, 19
+; BE-NEXT:    vor 2, 2, 6
+; BE-NEXT:    vor 3, 3, 9
+; BE-NEXT:    vand 6, 2, 12
+; BE-NEXT:    vsrw 2, 2, 30
+; BE-NEXT:    vand 9, 3, 12
+; BE-NEXT:    vsrw 3, 3, 30
+; BE-NEXT:    vand 2, 2, 12
+; BE-NEXT:    vand 3, 3, 12
+; BE-NEXT:    lvx 12, 0, 3
+; BE-NEXT:    li 3, 1200
+; BE-NEXT:    stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1136
+; BE-NEXT:    stvx 18, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI10_3 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_3 at toc@l
+; BE-NEXT:    vslw 6, 6, 30
+; BE-NEXT:    vslw 9, 9, 30
+; BE-NEXT:    vor 2, 2, 6
+; BE-NEXT:    vor 3, 3, 9
+; BE-NEXT:    vsrw 6, 2, 31
+; BE-NEXT:    vand 2, 2, 12
+; BE-NEXT:    vadduwm 2, 2, 2
+; BE-NEXT:    vsrw 9, 3, 31
+; BE-NEXT:    vand 3, 3, 12
+; BE-NEXT:    vand 6, 6, 12
+; BE-NEXT:    vand 12, 9, 12
+; BE-NEXT:    vor 9, 6, 2
+; BE-NEXT:    vadduwm 2, 3, 3
+; BE-NEXT:    vor 14, 12, 2
+; BE-NEXT:    vadduwm 2, 18, 18
+; BE-NEXT:    vand 28, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_4 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_4 at toc@l
+; BE-NEXT:    vand 27, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_5 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_5 at toc@l
+; BE-NEXT:    vand 25, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_6 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_6 at toc@l
+; BE-NEXT:    vslw 4, 19, 19
+; BE-NEXT:    vand 26, 14, 4
+; BE-NEXT:    vand 4, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_7 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_7 at toc@l
+; BE-NEXT:    vsldoi 5, 31, 31, 1
+; BE-NEXT:    vand 24, 14, 5
+; BE-NEXT:    vand 5, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_8 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_8 at toc@l
+; BE-NEXT:    vand 29, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_9 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_9 at toc@l
+; BE-NEXT:    vand 21, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_10 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_10 at toc@l
+; BE-NEXT:    vslw 7, 18, 18
+; BE-NEXT:    vand 3, 14, 7
+; BE-NEXT:    vand 7, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_11 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_11 at toc@l
+; BE-NEXT:    vsldoi 13, 18, 18, 2
+; BE-NEXT:    vand 16, 14, 13
+; BE-NEXT:    vand 13, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    addis 3, 2, .LCPI10_12 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_12 at toc@l
+; BE-NEXT:    vand 12, 14, 2
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 1184
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1168
+; BE-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1152
+; BE-NEXT:    vsldoi 11, 31, 31, 2
+; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    vsldoi 1, 19, 19, 1
+; BE-NEXT:    vsldoi 10, 30, 30, 2
+; BE-NEXT:    vand 20, 14, 11
+; BE-NEXT:    vand 11, 14, 2
+; BE-NEXT:    vsldoi 2, 31, 31, 3
+; BE-NEXT:    vsldoi 8, 19, 19, 2
+; BE-NEXT:    vand 22, 14, 1
+; BE-NEXT:    vand 1, 14, 10
+; BE-NEXT:    vand 10, 14, 2
+; BE-NEXT:    vsldoi 2, 30, 30, 3
+; BE-NEXT:    vand 17, 14, 8
+; BE-NEXT:    vand 8, 14, 2
+; BE-NEXT:    vsldoi 2, 19, 19, 3
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vsldoi 2, 18, 18, 3
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI10_13 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_13 at toc@l
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI10_14 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_14 at toc@l
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 288
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    addis 3, 2, .LCPI10_15 at toc@ha
+; BE-NEXT:    addi 3, 3, .LCPI10_15 at toc@l
+; BE-NEXT:    lvx 2, 0, 3
+; BE-NEXT:    li 3, 192
+; BE-NEXT:    vand 2, 14, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 272
+; BE-NEXT:    vand 2, 14, 15
+; BE-NEXT:    vspltisw 15, -16
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    vand 2, 14, 30
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vand 31, 14, 31
+; BE-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vsldoi 0, 30, 30, 1
+; BE-NEXT:    vand 19, 14, 19
+; BE-NEXT:    stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    vand 23, 14, 0
+; BE-NEXT:    vand 14, 14, 18
+; BE-NEXT:    stvx 14, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    vxor 6, 6, 6
+; BE-NEXT:    vrlw 0, 2, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1104
+; BE-NEXT:    vrlw 0, 31, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1088
+; BE-NEXT:    vrlw 0, 19, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1056
+; BE-NEXT:    vrlw 0, 14, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1024
+; BE-NEXT:    vrlw 0, 28, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 992
+; BE-NEXT:    vrlw 0, 27, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 960
+; BE-NEXT:    vrlw 0, 26, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 928
+; BE-NEXT:    vrlw 0, 25, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 896
+; BE-NEXT:    vrlw 0, 24, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 864
+; BE-NEXT:    vrlw 0, 23, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 832
+; BE-NEXT:    vrlw 0, 22, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 800
+; BE-NEXT:    vrlw 0, 3, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 768
+; BE-NEXT:    vrlw 0, 4, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 736
+; BE-NEXT:    vrlw 0, 5, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 704
+; BE-NEXT:    vrlw 0, 29, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 672
+; BE-NEXT:    vrlw 0, 21, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 640
+; BE-NEXT:    vrlw 0, 20, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 592
+; BE-NEXT:    vrlw 0, 1, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 560
+; BE-NEXT:    vrlw 0, 17, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 528
+; BE-NEXT:    vrlw 0, 16, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 496
+; BE-NEXT:    vrlw 0, 7, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 464
+; BE-NEXT:    vrlw 0, 13, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 432
+; BE-NEXT:    vrlw 0, 12, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 400
+; BE-NEXT:    vrlw 0, 11, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 368
+; BE-NEXT:    vrlw 0, 10, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 336
+; BE-NEXT:    vrlw 0, 8, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    vmr 14, 7
+; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 304
+; BE-NEXT:    vrlw 0, 7, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vmr 30, 1
+; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 240
+; BE-NEXT:    vrlw 0, 1, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vmr 19, 5
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 208
+; BE-NEXT:    vrlw 0, 5, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 288
+; BE-NEXT:    vmr 18, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 160
+; BE-NEXT:    vrlw 0, 4, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 192
+; BE-NEXT:    vmr 31, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 128
+; BE-NEXT:    vrlw 0, 3, 15
+; BE-NEXT:    vmsumuhm 2, 9, 0, 6
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 272
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 64
+; BE-NEXT:    vrlw 0, 2, 15
+; BE-NEXT:    vmsumuhm 0, 9, 0, 6
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 96
+; BE-NEXT:    vmulouh 0, 9, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 80
+; BE-NEXT:    vmulouh 0, 9, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 112
+; BE-NEXT:    vmulouh 0, 9, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 144
+; BE-NEXT:    vmulouh 0, 9, 0
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 176
+; BE-NEXT:    vmulouh 0, 9, 28
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 224
+; BE-NEXT:    vmulouh 0, 9, 27
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 256
+; BE-NEXT:    vmulouh 0, 9, 26
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 320
+; BE-NEXT:    vmulouh 0, 9, 25
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 352
+; BE-NEXT:    vmulouh 0, 9, 24
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 384
+; BE-NEXT:    vmulouh 0, 9, 23
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 416
+; BE-NEXT:    vmulouh 0, 9, 22
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 448
+; BE-NEXT:    vmulouh 0, 9, 31
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 480
+; BE-NEXT:    vmulouh 0, 9, 18
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 512
+; BE-NEXT:    vmulouh 0, 9, 19
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 544
+; BE-NEXT:    vmulouh 0, 9, 29
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 576
+; BE-NEXT:    vmulouh 0, 9, 21
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 608
+; BE-NEXT:    vmulouh 0, 9, 20
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 624
+; BE-NEXT:    vmulouh 0, 9, 30
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 656
+; BE-NEXT:    vmulouh 0, 9, 17
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 688
+; BE-NEXT:    vmulouh 0, 9, 16
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 720
+; BE-NEXT:    vmulouh 0, 9, 14
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 752
+; BE-NEXT:    vmulouh 0, 9, 13
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 784
+; BE-NEXT:    vmulouh 0, 9, 12
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 816
+; BE-NEXT:    vmulouh 0, 9, 11
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 848
+; BE-NEXT:    vmulouh 0, 9, 10
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    vmulouh 0, 9, 8
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vmulouh 0, 9, 7
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vmulouh 0, 9, 1
+; BE-NEXT:    stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    vmulouh 5, 9, 5
+; BE-NEXT:    stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vmulouh 4, 9, 4
+; BE-NEXT:    stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vmulouh 3, 9, 3
+; BE-NEXT:    stvx 3, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    vmulouh 2, 9, 2
+; BE-NEXT:    stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1104
+; BE-NEXT:    vslw 9, 2, 15
+; BE-NEXT:    lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1088
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1056
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1024
+; BE-NEXT:    vslw 2, 2, 15
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 992
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 960
+; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 928
+; BE-NEXT:    vslw 3, 3, 15
+; BE-NEXT:    lvx 6, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 896
+; BE-NEXT:    lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 864
+; BE-NEXT:    lvx 8, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 832
+; BE-NEXT:    vslw 4, 4, 15
+; BE-NEXT:    lvx 10, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 800
+; BE-NEXT:    lvx 11, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 768
+; BE-NEXT:    vslw 5, 5, 15
+; BE-NEXT:    lvx 12, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 736
+; BE-NEXT:    lvx 13, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 704
+; BE-NEXT:    lvx 14, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 672
+; BE-NEXT:    vslw 0, 0, 15
+; BE-NEXT:    lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 640
+; BE-NEXT:    lvx 17, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 592
+; BE-NEXT:    lvx 18, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 560
+; BE-NEXT:    vslw 1, 1, 15
+; BE-NEXT:    lvx 19, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 528
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 496
+; BE-NEXT:    vslw 6, 6, 15
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 464
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 432
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 400
+; BE-NEXT:    vslw 7, 7, 15
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 368
+; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 336
+; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 304
+; BE-NEXT:    vslw 8, 8, 15
+; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 240
+; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 208
+; BE-NEXT:    vslw 10, 10, 15
+; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 160
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 128
+; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    vslw 11, 11, 15
+; BE-NEXT:    vslw 20, 20, 15
+; BE-NEXT:    stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT:    li 3, 64
+; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 96
+; BE-NEXT:    vslw 12, 12, 15
+; BE-NEXT:    vslw 13, 13, 15
+; BE-NEXT:    vslw 14, 14, 15
+; BE-NEXT:    vslw 16, 16, 15
+; BE-NEXT:    vslw 17, 17, 15
+; BE-NEXT:    vslw 18, 18, 15
+; BE-NEXT:    vslw 19, 19, 15
+; BE-NEXT:    vslw 31, 31, 15
+; BE-NEXT:    vslw 30, 30, 15
+; BE-NEXT:    vslw 29, 29, 15
+; BE-NEXT:    vslw 28, 28, 15
+; BE-NEXT:    vslw 27, 27, 15
+; BE-NEXT:    vslw 26, 26, 15
+; BE-NEXT:    vslw 25, 25, 15
+; BE-NEXT:    vslw 24, 24, 15
+; BE-NEXT:    vslw 23, 23, 15
+; BE-NEXT:    vslw 22, 22, 15
+; BE-NEXT:    vslw 21, 21, 15
+; BE-NEXT:    vslw 20, 20, 15
+; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 80
+; BE-NEXT:    vadduwm 9, 15, 9
+; BE-NEXT:    lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 112
+; BE-NEXT:    vadduwm 2, 15, 2
+; BE-NEXT:    vxor 2, 2, 9
+; BE-NEXT:    lvx 9, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 144
+; BE-NEXT:    vadduwm 3, 9, 3
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 176
+; BE-NEXT:    vadduwm 3, 3, 4
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 224
+; BE-NEXT:    vadduwm 3, 3, 5
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 256
+; BE-NEXT:    vadduwm 3, 3, 0
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 320
+; BE-NEXT:    vadduwm 3, 3, 1
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 352
+; BE-NEXT:    vadduwm 3, 3, 6
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 384
+; BE-NEXT:    vadduwm 3, 3, 7
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 416
+; BE-NEXT:    vadduwm 3, 3, 8
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 448
+; BE-NEXT:    vadduwm 3, 3, 10
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 480
+; BE-NEXT:    vadduwm 3, 3, 11
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 512
+; BE-NEXT:    vadduwm 3, 3, 12
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 544
+; BE-NEXT:    vadduwm 3, 3, 13
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 576
+; BE-NEXT:    vadduwm 3, 3, 14
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 608
+; BE-NEXT:    vadduwm 3, 3, 16
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 624
+; BE-NEXT:    vadduwm 3, 3, 17
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 656
+; BE-NEXT:    vadduwm 3, 3, 18
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 688
+; BE-NEXT:    vadduwm 3, 3, 19
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 720
+; BE-NEXT:    vadduwm 3, 3, 31
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 752
+; BE-NEXT:    vadduwm 3, 3, 30
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 784
+; BE-NEXT:    vadduwm 3, 3, 29
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 816
+; BE-NEXT:    vadduwm 3, 3, 28
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 848
+; BE-NEXT:    vadduwm 3, 3, 27
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 880
+; BE-NEXT:    vadduwm 3, 3, 26
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 912
+; BE-NEXT:    vadduwm 3, 3, 25
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 944
+; BE-NEXT:    vadduwm 3, 3, 24
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 976
+; BE-NEXT:    vadduwm 3, 3, 23
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1008
+; BE-NEXT:    vadduwm 3, 3, 22
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1040
+; BE-NEXT:    vadduwm 3, 3, 21
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1120
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1072
+; BE-NEXT:    vadduwm 3, 3, 4
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1264
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1136
+; BE-NEXT:    vadduwm 3, 3, 20
+; BE-NEXT:    lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1248
+; BE-NEXT:    vxor 2, 2, 3
+; BE-NEXT:    lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1232
+; BE-NEXT:    vsrw 3, 2, 5
+; BE-NEXT:    vsrw 4, 2, 1
+; BE-NEXT:    vslw 5, 2, 5
+; BE-NEXT:    vand 2, 2, 0
+; BE-NEXT:    vslw 2, 2, 1
+; BE-NEXT:    vand 4, 4, 0
+; BE-NEXT:    vor 2, 5, 2
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1152
+; BE-NEXT:    vor 3, 4, 3
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1216
+; BE-NEXT:    vor 2, 2, 3
+; BE-NEXT:    vand 3, 2, 5
+; BE-NEXT:    vsrw 2, 2, 4
+; BE-NEXT:    vand 2, 2, 5
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1168
+; BE-NEXT:    vslw 3, 3, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1184
+; BE-NEXT:    vor 2, 2, 3
+; BE-NEXT:    vand 3, 2, 5
+; BE-NEXT:    vsrw 2, 2, 4
+; BE-NEXT:    vslw 3, 3, 4
+; BE-NEXT:    lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1200
+; BE-NEXT:    vand 2, 2, 5
+; BE-NEXT:    lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1456
+; BE-NEXT:    lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1440
+; BE-NEXT:    lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1424
+; BE-NEXT:    vor 2, 2, 3
+; BE-NEXT:    lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1408
+; BE-NEXT:    lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1392
+; BE-NEXT:    lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1376
+; BE-NEXT:    vsrw 3, 2, 4
+; BE-NEXT:    lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1360
+; BE-NEXT:    lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1344
+; BE-NEXT:    lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1328
+; BE-NEXT:    vand 2, 2, 5
+; BE-NEXT:    vadduwm 2, 2, 2
+; BE-NEXT:    lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1312
+; BE-NEXT:    lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1296
+; BE-NEXT:    vand 3, 3, 5
+; BE-NEXT:    lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    li 3, 1280
+; BE-NEXT:    lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT:    vor 2, 3, 2
+; BE-NEXT:    vsrw 2, 2, 4
+; BE-NEXT:    addi 1, 1, 1472
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulh_v4i32:
+; LE:       # %bb.0:
+; LE-NEXT:    addis 3, 2, .LCPI10_0@toc@ha
+; LE-NEXT:    vspltisw 7, 12
+; LE-NEXT:    vspltisw 4, 8
+; LE-NEXT:    addi 3, 3, .LCPI10_0@toc@l
+; LE-NEXT:    vadduwm 7, 7, 7
+; LE-NEXT:    vsrw 18, 2, 4
+; LE-NEXT:    vspltisb 5, 15
+; LE-NEXT:    vspltisw 0, 4
+; LE-NEXT:    lxvd2x 0, 0, 3
+; LE-NEXT:    vsrw 17, 2, 7
+; LE-NEXT:    addis 3, 2, .LCPI10_1@toc@ha
+; LE-NEXT:    vspltisw 6, 2
+; LE-NEXT:    vspltisw 1, 1
+; LE-NEXT:    vsldoi 10, 0, 0, 1
+; LE-NEXT:    addi 3, 3, .LCPI10_1@toc@l
+; LE-NEXT:    vsldoi 13, 0, 0, 2
+; LE-NEXT:    vsldoi 9, 6, 6, 1
+; LE-NEXT:    vsldoi 12, 6, 6, 2
+; LE-NEXT:    vsldoi 14, 4, 4, 2
+; LE-NEXT:    vsldoi 16, 6, 6, 3
+; LE-NEXT:    vsldoi 8, 1, 1, 1
+; LE-NEXT:    vsldoi 11, 1, 1, 2
+; LE-NEXT:    vsldoi 15, 1, 1, 3
+; LE-NEXT:    xxland 1, 50, 0
+; LE-NEXT:    xxlor 1, 1, 49
+; LE-NEXT:    vslw 17, 2, 7
+; LE-NEXT:    xxland 34, 34, 0
+; LE-NEXT:    vslw 2, 2, 4
+; LE-NEXT:    xxlor 2, 49, 34
+; LE-NEXT:    xxlor 34, 2, 1
+; LE-NEXT:    xxland 50, 34, 37
+; LE-NEXT:    vsrw 2, 2, 0
+; LE-NEXT:    vslw 18, 18, 0
+; LE-NEXT:    xxland 1, 34, 37
+; LE-NEXT:    xxlor 34, 1, 50
+; LE-NEXT:    lxvd2x 1, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_2@toc@ha
+; LE-NEXT:    addi 3, 3, .LCPI10_2@toc@l
+; LE-NEXT:    xxland 51, 34, 1
+; LE-NEXT:    vsrw 2, 2, 6
+; LE-NEXT:    vslw 19, 19, 6
+; LE-NEXT:    xxland 2, 34, 1
+; LE-NEXT:    xxlor 34, 2, 51
+; LE-NEXT:    lxvd2x 2, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_3@toc@ha
+; LE-NEXT:    vsrw 19, 2, 1
+; LE-NEXT:    addi 3, 3, .LCPI10_3@toc@l
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_4@toc@ha
+; LE-NEXT:    xxland 34, 34, 2
+; LE-NEXT:    xxland 3, 51, 2
+; LE-NEXT:    vsrw 19, 3, 4
+; LE-NEXT:    addi 3, 3, .LCPI10_4@toc@l
+; LE-NEXT:    vadduwm 2, 2, 2
+; LE-NEXT:    xxlor 34, 3, 34
+; LE-NEXT:    xxland 3, 51, 0
+; LE-NEXT:    vsrw 19, 3, 7
+; LE-NEXT:    xxlor 3, 3, 51
+; LE-NEXT:    vslw 19, 3, 7
+; LE-NEXT:    xxland 35, 35, 0
+; LE-NEXT:    vsldoi 17, 0, 0, 3
+; LE-NEXT:    vslw 3, 3, 4
+; LE-NEXT:    xxlor 4, 51, 35
+; LE-NEXT:    xxlor 35, 4, 3
+; LE-NEXT:    xxland 51, 35, 37
+; LE-NEXT:    vsrw 3, 3, 0
+; LE-NEXT:    vslw 19, 19, 0
+; LE-NEXT:    xxland 3, 35, 37
+; LE-NEXT:    xxlor 35, 3, 51
+; LE-NEXT:    xxland 51, 35, 1
+; LE-NEXT:    vsrw 3, 3, 6
+; LE-NEXT:    vslw 19, 19, 6
+; LE-NEXT:    xxland 3, 35, 1
+; LE-NEXT:    xxlor 35, 3, 51
+; LE-NEXT:    vsrw 19, 3, 1
+; LE-NEXT:    xxland 35, 35, 2
+; LE-NEXT:    xxland 3, 51, 2
+; LE-NEXT:    vadduwm 3, 3, 3
+; LE-NEXT:    xxlor 3, 3, 35
+; LE-NEXT:    xxland 35, 3, 38
+; LE-NEXT:    xxland 51, 3, 33
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    vmuluwm 19, 2, 19
+; LE-NEXT:    xxlxor 4, 51, 35
+; LE-NEXT:    xxland 35, 3, 32
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 36
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    vadduwm 3, 4, 4
+; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    vsldoi 18, 4, 4, 3
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_5@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_5@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    vslw 3, 0, 0
+; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_6@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_6@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 40
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 41
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 42
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    vslw 3, 4, 4
+; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_7@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_7@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_8@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_8@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_9@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_9@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_10@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_10@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 43
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 44
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 45
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 46
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_11@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_11@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_12@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_12@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_13@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_13@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_14@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_14@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 47
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 48
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 49
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 50
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    addis 3, 2, .LCPI10_15@toc@ha
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    addi 3, 3, .LCPI10_15@toc@l
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    lxvd2x 5, 0, 3
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxland 35, 3, 5
+; LE-NEXT:    vmuluwm 3, 2, 3
+; LE-NEXT:    xxlxor 4, 4, 35
+; LE-NEXT:    xxleqv 35, 35, 35
+; LE-NEXT:    vslw 3, 3, 3
+; LE-NEXT:    xxland 35, 3, 35
+; LE-NEXT:    vmuluwm 2, 2, 3
+; LE-NEXT:    xxlxor 34, 4, 34
+; LE-NEXT:    vsrw 8, 2, 4
+; LE-NEXT:    vsrw 3, 2, 7
+; LE-NEXT:    xxland 3, 40, 0
+; LE-NEXT:    xxlor 3, 3, 35
+; LE-NEXT:    vslw 3, 2, 7
+; LE-NEXT:    xxland 34, 34, 0
+; LE-NEXT:    vslw 2, 2, 4
+; LE-NEXT:    xxlor 0, 35, 34
+; LE-NEXT:    xxlor 34, 0, 3
+; LE-NEXT:    xxland 35, 34, 37
+; LE-NEXT:    vsrw 2, 2, 0
+; LE-NEXT:    vslw 3, 3, 0
+; LE-NEXT:    xxland 0, 34, 37
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    xxland 35, 34, 1
+; LE-NEXT:    vsrw 2, 2, 6
+; LE-NEXT:    vslw 3, 3, 6
+; LE-NEXT:    xxland 0, 34, 1
+; LE-NEXT:    xxlor 34, 0, 35
+; LE-NEXT:    vsrw 3, 2, 1
+; LE-NEXT:    xxland 34, 34, 2
+; LE-NEXT:    xxland 0, 35, 2
+; LE-NEXT:    vadduwm 2, 2, 2
+; LE-NEXT:    xxlor 34, 0, 34
+; LE-NEXT:    vsrw 2, 2, 1
+; LE-NEXT:    blr
+  %a.ext = zext <4 x i32> %a to <4 x i64>
+  %b.ext = zext <4 x i32> %b to <4 x i64>
+  %clmul = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %a.ext, <4 x i64> %b.ext)
+  %res.ext = lshr <4 x i64> %clmul, splat (i64 32)
+  %res = trunc <4 x i64> %res.ext to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; BE-LABEL: clmulh_v2i64:
+; BE:       # %bb.0:
+; BE-NEXT:    stdu 1, -1040(1)
+; BE-NEXT:    lis 7, -21846
+; BE-NEXT:    lis 8, 21845
+; BE-NEXT:    std 26, 992(1) # 8-byte Folded Spill
+; BE-NEXT:    ori 7, 7, 43690
+; BE-NEXT:    ori 8, 8, 21845
+; BE-NEXT:    std 27, 1000(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 7, 7, 32
+; BE-NEXT:    sldi 8, 8, 32
+; BE-NEXT:    lis 9, -13108
+; BE-NEXT:    lis 10, 13107
+; BE-NEXT:    std 30, 1024(1) # 8-byte Folded Spill
+; BE-NEXT:    oris 7, 7, 43690
+; BE-NEXT:    oris 8, 8, 21845
+; BE-NEXT:    std 28, 1008(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 0, 3, 1
+; BE-NEXT:    rldicl 3, 3, 63, 1
+; BE-NEXT:    ori 9, 9, 52428
+; BE-NEXT:    ori 10, 10, 13107
+; BE-NEXT:    std 29, 1016(1) # 8-byte Folded Spill
+; BE-NEXT:    ori 27, 7, 43690
+; BE-NEXT:    ori 26, 8, 21845
+; BE-NEXT:    std 2, 888(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 9, 9, 32
+; BE-NEXT:    sldi 10, 10, 32
+; BE-NEXT:    and 7, 0, 27
+; BE-NEXT:    and 3, 3, 26
+; BE-NEXT:    std 31, 1032(1) # 8-byte Folded Spill
+; BE-NEXT:    lis 11, -3856
+; BE-NEXT:    lis 12, 3855
+; BE-NEXT:    std 15, 904(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 30, 5, 1
+; BE-NEXT:    rldicl 5, 5, 63, 1
+; BE-NEXT:    oris 9, 9, 52428
+; BE-NEXT:    oris 10, 10, 13107
+; BE-NEXT:    std 14, 896(1) # 8-byte Folded Spill
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    ori 11, 11, 61680
+; BE-NEXT:    std 17, 920(1) # 8-byte Folded Spill
+; BE-NEXT:    ori 12, 12, 3855
+; BE-NEXT:    ori 29, 9, 52428
+; BE-NEXT:    ori 28, 10, 13107
+; BE-NEXT:    and 8, 30, 27
+; BE-NEXT:    std 16, 912(1) # 8-byte Folded Spill
+; BE-NEXT:    and 5, 5, 26
+; BE-NEXT:    sldi 7, 3, 2
+; BE-NEXT:    std 19, 936(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 3, 62, 2
+; BE-NEXT:    sldi 11, 11, 32
+; BE-NEXT:    sldi 12, 12, 32
+; BE-NEXT:    or 5, 5, 8
+; BE-NEXT:    std 18, 928(1) # 8-byte Folded Spill
+; BE-NEXT:    and 7, 7, 29
+; BE-NEXT:    and 3, 3, 28
+; BE-NEXT:    std 21, 952(1) # 8-byte Folded Spill
+; BE-NEXT:    oris 11, 11, 61680
+; BE-NEXT:    oris 12, 12, 3855
+; BE-NEXT:    sldi 8, 5, 2
+; BE-NEXT:    rldicl 5, 5, 62, 2
+; BE-NEXT:    std 20, 944(1) # 8-byte Folded Spill
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    ori 9, 11, 61680
+; BE-NEXT:    std 23, 968(1) # 8-byte Folded Spill
+; BE-NEXT:    ori 10, 12, 3855
+; BE-NEXT:    and 8, 8, 29
+; BE-NEXT:    and 5, 5, 28
+; BE-NEXT:    sldi 7, 3, 4
+; BE-NEXT:    std 22, 960(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 3, 60, 4
+; BE-NEXT:    or 5, 5, 8
+; BE-NEXT:    std 25, 984(1) # 8-byte Folded Spill
+; BE-NEXT:    and 7, 7, 9
+; BE-NEXT:    and 3, 3, 10
+; BE-NEXT:    sldi 8, 5, 4
+; BE-NEXT:    rldicl 5, 5, 60, 4
+; BE-NEXT:    std 24, 976(1) # 8-byte Folded Spill
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    and 8, 8, 9
+; BE-NEXT:    std 27, 352(1) # 8-byte Folded Spill
+; BE-NEXT:    and 5, 5, 10
+; BE-NEXT:    rotlwi 7, 3, 24
+; BE-NEXT:    or 5, 5, 8
+; BE-NEXT:    rlwimi 7, 3, 8, 8, 15
+; BE-NEXT:    std 26, 344(1) # 8-byte Folded Spill
+; BE-NEXT:    mr 30, 9
+; BE-NEXT:    std 29, 368(1) # 8-byte Folded Spill
+; BE-NEXT:    rotlwi 8, 5, 24
+; BE-NEXT:    rldicl 9, 3, 32, 32
+; BE-NEXT:    rlwimi 7, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 5, 32, 32
+; BE-NEXT:    std 28, 360(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwimi 8, 5, 8, 8, 15
+; BE-NEXT:    std 30, 376(1) # 8-byte Folded Spill
+; BE-NEXT:    rotlwi 11, 3, 24
+; BE-NEXT:    mr 0, 10
+; BE-NEXT:    rotlwi 10, 9, 24
+; BE-NEXT:    std 0, 384(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwimi 11, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 8, 5, 8, 24, 31
+; BE-NEXT:    rlwimi 10, 9, 8, 8, 15
+; BE-NEXT:    rlwimi 11, 3, 8, 24, 31
+; BE-NEXT:    sldi 5, 8, 32
+; BE-NEXT:    rlwimi 10, 9, 8, 24, 31
+; BE-NEXT:    sldi 3, 7, 32
+; BE-NEXT:    or 11, 5, 11
+; BE-NEXT:    or 12, 3, 10
+; BE-NEXT:    rlwinm 3, 11, 0, 30, 30
+; BE-NEXT:    rlwinm 5, 11, 0, 29, 29
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 880(1) # 8-byte Folded Spill
+; BE-NEXT:    clrldi 3, 11, 63
+; BE-NEXT:    mulld 2, 12, 3
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 872(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 28, 28
+; BE-NEXT:    rlwinm 5, 11, 0, 27, 27
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 856(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 864(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 26, 26
+; BE-NEXT:    rlwinm 5, 11, 0, 25, 25
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 840(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 848(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 24, 24
+; BE-NEXT:    rlwinm 5, 11, 0, 23, 23
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 824(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 832(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 22, 22
+; BE-NEXT:    rlwinm 5, 11, 0, 21, 21
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 808(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 816(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 20, 20
+; BE-NEXT:    rlwinm 5, 11, 0, 19, 19
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 792(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 800(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 18, 18
+; BE-NEXT:    rlwinm 5, 11, 0, 17, 17
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 776(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 784(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 16, 16
+; BE-NEXT:    rlwinm 5, 11, 0, 15, 15
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 760(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 768(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 14, 14
+; BE-NEXT:    rlwinm 5, 11, 0, 13, 13
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 744(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 752(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 12, 12
+; BE-NEXT:    rlwinm 5, 11, 0, 11, 11
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 728(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 736(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 10, 10
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 720(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 9, 9
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 712(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 8, 8
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 704(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 7, 7
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 696(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 6, 6
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 688(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 5, 5
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 680(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 4, 4
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 672(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 3, 3
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 664(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 2, 2
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 656(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 1, 1
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 648(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 3, 11, 0, 0, 0
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 640(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 32, 32
+; BE-NEXT:    rldicl 3, 3, 32, 31
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 632(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 31, 33
+; BE-NEXT:    rldicl 3, 3, 33, 30
+; BE-NEXT:    rldicl 5, 11, 30, 34
+; BE-NEXT:    rldicl 5, 5, 34, 29
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 616(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 624(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 29, 35
+; BE-NEXT:    rldicl 3, 3, 35, 28
+; BE-NEXT:    rldicl 5, 11, 28, 36
+; BE-NEXT:    rldicl 5, 5, 36, 27
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 600(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 608(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 27, 37
+; BE-NEXT:    rldicl 3, 3, 37, 26
+; BE-NEXT:    rldicl 5, 11, 26, 38
+; BE-NEXT:    rldicl 5, 5, 38, 25
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 584(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 592(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 25, 39
+; BE-NEXT:    rldicl 3, 3, 39, 24
+; BE-NEXT:    rldicl 5, 11, 24, 40
+; BE-NEXT:    rldicl 5, 5, 40, 23
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 568(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 576(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 23, 41
+; BE-NEXT:    rldicl 3, 3, 41, 22
+; BE-NEXT:    rldicl 5, 11, 22, 42
+; BE-NEXT:    rldicl 5, 5, 42, 21
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 552(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 560(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 21, 43
+; BE-NEXT:    rldicl 3, 3, 43, 20
+; BE-NEXT:    rldicl 5, 11, 20, 44
+; BE-NEXT:    rldicl 5, 5, 44, 19
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 536(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 544(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 19, 45
+; BE-NEXT:    rldicl 3, 3, 45, 18
+; BE-NEXT:    rldicl 5, 11, 18, 46
+; BE-NEXT:    rldicl 5, 5, 46, 17
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 520(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 528(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 17, 47
+; BE-NEXT:    rldicl 3, 3, 47, 16
+; BE-NEXT:    rldicl 5, 11, 16, 48
+; BE-NEXT:    rldicl 5, 5, 48, 15
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 504(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 512(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 15, 49
+; BE-NEXT:    rldicl 3, 3, 49, 14
+; BE-NEXT:    rldicl 5, 11, 14, 50
+; BE-NEXT:    rldicl 5, 5, 50, 13
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 488(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 496(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 13, 51
+; BE-NEXT:    rldicl 3, 3, 51, 12
+; BE-NEXT:    rldicl 5, 11, 12, 52
+; BE-NEXT:    rldicl 5, 5, 52, 11
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 472(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 480(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 11, 53
+; BE-NEXT:    rldicl 3, 3, 53, 10
+; BE-NEXT:    rldicl 5, 11, 10, 54
+; BE-NEXT:    rldicl 5, 5, 54, 9
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 456(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 464(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 9, 55
+; BE-NEXT:    rldicl 3, 3, 55, 8
+; BE-NEXT:    rldicl 5, 11, 8, 56
+; BE-NEXT:    rldicl 5, 5, 56, 7
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 440(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 448(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 7, 57
+; BE-NEXT:    rldicl 3, 3, 57, 6
+; BE-NEXT:    rldicl 5, 11, 6, 58
+; BE-NEXT:    rldicl 5, 5, 58, 5
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 424(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 432(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 5, 59
+; BE-NEXT:    rldicl 3, 3, 59, 4
+; BE-NEXT:    rldicl 5, 11, 4, 60
+; BE-NEXT:    rldicl 5, 5, 60, 3
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 408(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 416(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 3, 11, 3, 61
+; BE-NEXT:    rldicl 5, 11, 2, 62
+; BE-NEXT:    rldicl 3, 3, 61, 2
+; BE-NEXT:    rldicl 5, 5, 62, 1
+; BE-NEXT:    mulld 3, 12, 3
+; BE-NEXT:    std 3, 392(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 3, 12, 5
+; BE-NEXT:    std 3, 400(1) # 8-byte Folded Spill
+; BE-NEXT:    sldi 3, 4, 1
+; BE-NEXT:    rldicl 4, 4, 63, 1
+; BE-NEXT:    and 3, 3, 27
+; BE-NEXT:    and 4, 4, 26
+; BE-NEXT:    or 3, 4, 3
+; BE-NEXT:    sldi 4, 3, 2
+; BE-NEXT:    rldicl 3, 3, 62, 2
+; BE-NEXT:    and 4, 4, 29
+; BE-NEXT:    and 3, 3, 28
+; BE-NEXT:    or 3, 3, 4
+; BE-NEXT:    sldi 4, 3, 4
+; BE-NEXT:    rldicl 3, 3, 60, 4
+; BE-NEXT:    and 4, 4, 30
+; BE-NEXT:    and 3, 3, 0
+; BE-NEXT:    or 3, 3, 4
+; BE-NEXT:    rotlwi 4, 3, 24
+; BE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 3, 32, 32
+; BE-NEXT:    rotlwi 5, 3, 24
+; BE-NEXT:    rlwimi 5, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 5, 3, 8, 24, 31
+; BE-NEXT:    sldi 3, 6, 1
+; BE-NEXT:    rldicl 6, 6, 63, 1
+; BE-NEXT:    and 3, 3, 27
+; BE-NEXT:    and 6, 6, 26
+; BE-NEXT:    or 3, 6, 3
+; BE-NEXT:    sldi 6, 3, 2
+; BE-NEXT:    rldicl 3, 3, 62, 2
+; BE-NEXT:    and 6, 6, 29
+; BE-NEXT:    and 3, 3, 28
+; BE-NEXT:    or 3, 3, 6
+; BE-NEXT:    sldi 6, 3, 4
+; BE-NEXT:    rldicl 3, 3, 60, 4
+; BE-NEXT:    and 6, 6, 30
+; BE-NEXT:    and 3, 3, 0
+; BE-NEXT:    or 3, 3, 6
+; BE-NEXT:    rotlwi 6, 3, 24
+; BE-NEXT:    rlwimi 6, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 6, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 3, 32, 32
+; BE-NEXT:    rotlwi 7, 3, 24
+; BE-NEXT:    rlwimi 7, 3, 8, 8, 15
+; BE-NEXT:    rlwimi 7, 3, 8, 24, 31
+; BE-NEXT:    sldi 3, 4, 32
+; BE-NEXT:    or 4, 3, 5
+; BE-NEXT:    sldi 3, 6, 32
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    rlwinm 5, 3, 0, 30, 30
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 336(1) # 8-byte Folded Spill
+; BE-NEXT:    clrldi 5, 3, 63
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 328(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 29, 29
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 320(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 28, 28
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 312(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 27, 27
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 304(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 26, 26
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 296(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 25, 25
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 288(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 24, 24
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 280(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 23, 23
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 272(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 22, 22
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 264(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 21, 21
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 256(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 20, 20
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 248(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 19, 19
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 240(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 18, 18
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 232(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 17, 17
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 224(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 16, 16
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 216(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 15, 15
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 208(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 14, 14
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 200(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 13, 13
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 192(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 12, 12
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 184(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 11, 11
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 176(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 10, 10
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 168(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 9, 9
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 160(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 8, 8
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 152(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 7, 7
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 144(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 6, 6
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 136(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 5, 5
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 128(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 4, 4
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 120(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 3, 3
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 112(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 2, 2
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 104(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 1, 1
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 96(1) # 8-byte Folded Spill
+; BE-NEXT:    rlwinm 5, 3, 0, 0, 0
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 88(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 5, 3, 32, 32
+; BE-NEXT:    rldicl 5, 5, 32, 31
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 80(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 5, 3, 31, 33
+; BE-NEXT:    rldicl 5, 5, 33, 30
+; BE-NEXT:    rldicl 6, 3, 30, 34
+; BE-NEXT:    rldicl 6, 6, 34, 29
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    std 5, 64(1) # 8-byte Folded Spill
+; BE-NEXT:    mulld 5, 4, 6
+; BE-NEXT:    std 5, 72(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 5, 3, 29, 35
+; BE-NEXT:    rldicl 6, 3, 28, 36
+; BE-NEXT:    rldicl 5, 5, 35, 28
+; BE-NEXT:    rldicl 6, 6, 36, 27
+; BE-NEXT:    mulld 31, 4, 5
+; BE-NEXT:    mulld 5, 4, 6
+; BE-NEXT:    std 5, 56(1) # 8-byte Folded Spill
+; BE-NEXT:    rldicl 5, 3, 27, 37
+; BE-NEXT:    rldicl 5, 5, 37, 26
+; BE-NEXT:    rldicl 6, 3, 26, 38
+; BE-NEXT:    mulld 15, 4, 5
+; BE-NEXT:    rldicl 5, 3, 25, 39
+; BE-NEXT:    rldicl 6, 6, 38, 25
+; BE-NEXT:    rldicl 5, 5, 39, 24
+; BE-NEXT:    mulld 14, 4, 6
+; BE-NEXT:    rldicl 6, 3, 24, 40
+; BE-NEXT:    mulld 17, 4, 5
+; BE-NEXT:    rldicl 5, 3, 23, 41
+; BE-NEXT:    rldicl 6, 6, 40, 23
+; BE-NEXT:    rldicl 5, 5, 41, 22
+; BE-NEXT:    mulld 16, 4, 6
+; BE-NEXT:    rldicl 6, 3, 22, 42
+; BE-NEXT:    mulld 19, 4, 5
+; BE-NEXT:    rldicl 5, 3, 21, 43
+; BE-NEXT:    rldicl 6, 6, 42, 21
+; BE-NEXT:    rldicl 5, 5, 43, 20
+; BE-NEXT:    mulld 18, 4, 6
+; BE-NEXT:    rldicl 6, 3, 20, 44
+; BE-NEXT:    mulld 21, 4, 5
+; BE-NEXT:    rldicl 5, 3, 19, 45
+; BE-NEXT:    rldicl 6, 6, 44, 19
+; BE-NEXT:    rldicl 5, 5, 45, 18
+; BE-NEXT:    mulld 20, 4, 6
+; BE-NEXT:    rldicl 6, 3, 18, 46
+; BE-NEXT:    mulld 23, 4, 5
+; BE-NEXT:    rldicl 5, 3, 17, 47
+; BE-NEXT:    rldicl 6, 6, 46, 17
+; BE-NEXT:    rldicl 5, 5, 47, 16
+; BE-NEXT:    mulld 22, 4, 6
+; BE-NEXT:    rldicl 6, 3, 16, 48
+; BE-NEXT:    mulld 25, 4, 5
+; BE-NEXT:    rldicl 5, 3, 15, 49
+; BE-NEXT:    rldicl 6, 6, 48, 15
+; BE-NEXT:    rldicl 5, 5, 49, 14
+; BE-NEXT:    mulld 24, 4, 6
+; BE-NEXT:    rldicl 6, 3, 14, 50
+; BE-NEXT:    mulld 27, 4, 5
+; BE-NEXT:    rldicl 5, 3, 13, 51
+; BE-NEXT:    rldicl 6, 6, 50, 13
+; BE-NEXT:    rldicl 5, 5, 51, 12
+; BE-NEXT:    mulld 26, 4, 6
+; BE-NEXT:    rldicl 6, 3, 12, 52
+; BE-NEXT:    mulld 29, 4, 5
+; BE-NEXT:    rldicl 5, 3, 11, 53
+; BE-NEXT:    rldicl 6, 6, 52, 11
+; BE-NEXT:    rldicl 5, 5, 53, 10
+; BE-NEXT:    mulld 28, 4, 6
+; BE-NEXT:    rldicl 6, 3, 10, 54
+; BE-NEXT:    mulld 0, 4, 5
+; BE-NEXT:    rldicl 5, 3, 9, 55
+; BE-NEXT:    rldicl 6, 6, 54, 9
+; BE-NEXT:    rldicl 5, 5, 55, 8
+; BE-NEXT:    mulld 30, 4, 6
+; BE-NEXT:    rldicl 6, 3, 8, 56
+; BE-NEXT:    mulld 11, 4, 5
+; BE-NEXT:    rldicl 5, 3, 7, 57
+; BE-NEXT:    rldicl 6, 6, 56, 7
+; BE-NEXT:    rldicl 5, 5, 57, 6
+; BE-NEXT:    mulld 12, 4, 6
+; BE-NEXT:    rldicl 6, 3, 6, 58
+; BE-NEXT:    mulld 9, 4, 5
+; BE-NEXT:    rldicl 5, 3, 5, 59
+; BE-NEXT:    rldicl 6, 6, 58, 5
+; BE-NEXT:    rldicl 5, 5, 59, 4
+; BE-NEXT:    mulld 10, 4, 6
+; BE-NEXT:    rldicl 6, 3, 4, 60
+; BE-NEXT:    mulld 7, 4, 5
+; BE-NEXT:    rldicl 5, 3, 3, 61
+; BE-NEXT:    rldicl 3, 3, 2, 62
+; BE-NEXT:    rldicl 6, 6, 60, 3
+; BE-NEXT:    rldicl 3, 3, 62, 1
+; BE-NEXT:    mulld 8, 4, 6
+; BE-NEXT:    mulld 6, 4, 3
+; BE-NEXT:    ld 3, 880(1) # 8-byte Folded Reload
+; BE-NEXT:    rldicl 5, 5, 61, 2
+; BE-NEXT:    mulld 5, 4, 5
+; BE-NEXT:    ld 4, 336(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 2, 3
+; BE-NEXT:    ld 2, 328(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 2, 4
+; BE-NEXT:    ld 2, 872(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 320(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 856(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 312(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 864(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 304(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 840(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 296(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 848(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 288(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 824(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 280(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 832(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 272(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 808(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 264(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 816(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 256(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 792(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 248(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 800(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 240(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 776(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 232(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 784(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 224(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 760(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 216(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 768(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 208(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 744(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 200(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 752(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 192(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 728(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 184(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 736(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 176(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 720(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 168(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 712(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 160(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 704(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 152(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 696(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 144(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 688(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 136(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 680(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 128(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 672(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 120(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 664(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 112(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 656(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 104(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 648(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 96(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 640(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 88(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 632(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 80(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 616(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 64(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 624(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    ld 2, 72(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 2
+; BE-NEXT:    ld 2, 600(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 31
+; BE-NEXT:    ld 31, 608(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 2
+; BE-NEXT:    xor 3, 3, 31
+; BE-NEXT:    ld 31, 56(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 31
+; BE-NEXT:    ld 31, 584(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 15
+; BE-NEXT:    xor 4, 4, 14
+; BE-NEXT:    ld 15, 592(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 31
+; BE-NEXT:    xor 4, 4, 17
+; BE-NEXT:    xor 4, 4, 16
+; BE-NEXT:    xor 3, 3, 15
+; BE-NEXT:    ld 15, 568(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 19
+; BE-NEXT:    xor 4, 4, 18
+; BE-NEXT:    ld 17, 576(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 15
+; BE-NEXT:    xor 4, 4, 21
+; BE-NEXT:    xor 4, 4, 20
+; BE-NEXT:    xor 3, 3, 17
+; BE-NEXT:    ld 17, 552(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 23
+; BE-NEXT:    xor 4, 4, 22
+; BE-NEXT:    ld 19, 560(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 17
+; BE-NEXT:    xor 4, 4, 25
+; BE-NEXT:    xor 4, 4, 24
+; BE-NEXT:    xor 3, 3, 19
+; BE-NEXT:    ld 19, 536(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 27
+; BE-NEXT:    xor 4, 4, 26
+; BE-NEXT:    ld 21, 544(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 19
+; BE-NEXT:    xor 4, 4, 29
+; BE-NEXT:    xor 4, 4, 28
+; BE-NEXT:    xor 3, 3, 21
+; BE-NEXT:    ld 21, 520(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 0
+; BE-NEXT:    xor 4, 4, 30
+; BE-NEXT:    ld 23, 528(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 21
+; BE-NEXT:    xor 4, 4, 11
+; BE-NEXT:    xor 4, 4, 12
+; BE-NEXT:    xor 3, 3, 23
+; BE-NEXT:    ld 23, 504(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 9
+; BE-NEXT:    xor 4, 4, 10
+; BE-NEXT:    ld 25, 512(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 23
+; BE-NEXT:    xor 4, 4, 7
+; BE-NEXT:    xor 4, 4, 8
+; BE-NEXT:    xor 3, 3, 25
+; BE-NEXT:    ld 25, 488(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 4, 4, 5
+; BE-NEXT:    xor 4, 4, 6
+; BE-NEXT:    ld 27, 496(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 25
+; BE-NEXT:    sldi 6, 4, 1
+; BE-NEXT:    rldicl 4, 4, 63, 1
+; BE-NEXT:    xor 3, 3, 27
+; BE-NEXT:    ld 27, 472(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 29, 480(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 27
+; BE-NEXT:    xor 3, 3, 29
+; BE-NEXT:    ld 29, 456(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 0, 464(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 29
+; BE-NEXT:    xor 3, 3, 0
+; BE-NEXT:    ld 0, 440(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 11, 448(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 0
+; BE-NEXT:    xor 3, 3, 11
+; BE-NEXT:    ld 11, 424(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 9, 432(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 11
+; BE-NEXT:    xor 3, 3, 9
+; BE-NEXT:    ld 9, 408(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 7, 416(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 9
+; BE-NEXT:    xor 3, 3, 7
+; BE-NEXT:    ld 7, 392(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 5, 400(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 7
+; BE-NEXT:    ld 7, 352(1) # 8-byte Folded Reload
+; BE-NEXT:    xor 3, 3, 5
+; BE-NEXT:    sldi 5, 3, 1
+; BE-NEXT:    rldicl 3, 3, 63, 1
+; BE-NEXT:    ld 8, 344(1) # 8-byte Folded Reload
+; BE-NEXT:    and 5, 5, 7
+; BE-NEXT:    and 6, 6, 7
+; BE-NEXT:    and 3, 3, 8
+; BE-NEXT:    and 4, 4, 8
+; BE-NEXT:    ld 7, 368(1) # 8-byte Folded Reload
+; BE-NEXT:    or 3, 3, 5
+; BE-NEXT:    or 4, 4, 6
+; BE-NEXT:    ld 8, 360(1) # 8-byte Folded Reload
+; BE-NEXT:    sldi 5, 3, 2
+; BE-NEXT:    rldicl 3, 3, 62, 2
+; BE-NEXT:    sldi 6, 4, 2
+; BE-NEXT:    rldicl 4, 4, 62, 2
+; BE-NEXT:    ld 2, 888(1) # 8-byte Folded Reload
+; BE-NEXT:    and 5, 5, 7
+; BE-NEXT:    and 3, 3, 8
+; BE-NEXT:    ld 31, 1032(1) # 8-byte Folded Reload
+; BE-NEXT:    and 6, 6, 7
+; BE-NEXT:    and 4, 4, 8
+; BE-NEXT:    ld 8, 376(1) # 8-byte Folded Reload
+; BE-NEXT:    or 3, 3, 5
+; BE-NEXT:    or 4, 4, 6
+; BE-NEXT:    sldi 5, 3, 4
+; BE-NEXT:    rldicl 3, 3, 60, 4
+; BE-NEXT:    ld 7, 384(1) # 8-byte Folded Reload
+; BE-NEXT:    and 5, 5, 8
+; BE-NEXT:    sldi 6, 4, 4
+; BE-NEXT:    and 3, 3, 7
+; BE-NEXT:    rldicl 4, 4, 60, 4
+; BE-NEXT:    ld 30, 1024(1) # 8-byte Folded Reload
+; BE-NEXT:    or 3, 3, 5
+; BE-NEXT:    and 6, 6, 8
+; BE-NEXT:    ld 29, 1016(1) # 8-byte Folded Reload
+; BE-NEXT:    and 4, 4, 7
+; BE-NEXT:    rotlwi 5, 3, 24
+; BE-NEXT:    or 4, 4, 6
+; BE-NEXT:    rlwimi 5, 3, 8, 8, 15
+; BE-NEXT:    ld 28, 1008(1) # 8-byte Folded Reload
+; BE-NEXT:    rotlwi 6, 4, 24
+; BE-NEXT:    ld 27, 1000(1) # 8-byte Folded Reload
+; BE-NEXT:    rldicl 7, 3, 32, 32
+; BE-NEXT:    rlwimi 5, 3, 8, 24, 31
+; BE-NEXT:    rldicl 3, 4, 32, 32
+; BE-NEXT:    ld 26, 992(1) # 8-byte Folded Reload
+; BE-NEXT:    rlwimi 6, 4, 8, 8, 15
+; BE-NEXT:    rotlwi 8, 7, 24
+; BE-NEXT:    ld 25, 984(1) # 8-byte Folded Reload
+; BE-NEXT:    rotlwi 9, 3, 24
+; BE-NEXT:    rlwimi 8, 7, 8, 8, 15
+; BE-NEXT:    rlwimi 9, 3, 8, 8, 15
+; BE-NEXT:    ld 24, 976(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 23, 968(1) # 8-byte Folded Reload
+; BE-NEXT:    rlwimi 6, 4, 8, 24, 31
+; BE-NEXT:    rlwimi 8, 7, 8, 24, 31
+; BE-NEXT:    ld 22, 960(1) # 8-byte Folded Reload
+; BE-NEXT:    rlwimi 9, 3, 8, 24, 31
+; BE-NEXT:    sldi 3, 5, 32
+; BE-NEXT:    ld 21, 952(1) # 8-byte Folded Reload
+; BE-NEXT:    sldi 4, 6, 32
+; BE-NEXT:    or 3, 3, 8
+; BE-NEXT:    or 4, 4, 9
+; BE-NEXT:    ld 20, 944(1) # 8-byte Folded Reload
+; BE-NEXT:    rldicl 3, 3, 63, 1
+; BE-NEXT:    rldicl 4, 4, 63, 1
+; BE-NEXT:    ld 19, 936(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 18, 928(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 17, 920(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 16, 912(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 15, 904(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 14, 896(1) # 8-byte Folded Reload
+; BE-NEXT:    addi 1, 1, 1040
+; BE-NEXT:    blr
+;
+; LE-LABEL: clmulh_v2i64:
+; LE:       # %bb.0:
+; LE-NEXT:    stdu 1, -736(1)
+; LE-NEXT:    lis 4, -21846
+; LE-NEXT:    lis 5, 21845
+; LE-NEXT:    xxswapd 1, 35
+; LE-NEXT:    xxswapd 0, 34
+; LE-NEXT:    mfvsrd 3, 35
+; LE-NEXT:    mfvsrd 9, 34
+; LE-NEXT:    lis 6, -13108
+; LE-NEXT:    lis 7, 13107
+; LE-NEXT:    ori 4, 4, 43690
+; LE-NEXT:    ori 5, 5, 21845
+; LE-NEXT:    mffprd 8, 1
+; LE-NEXT:    mffprd 10, 0
+; LE-NEXT:    std 28, 704(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 712(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 6, 6, 52428
+; LE-NEXT:    ori 7, 7, 13107
+; LE-NEXT:    sldi 4, 4, 32
+; LE-NEXT:    sldi 5, 5, 32
+; LE-NEXT:    sldi 6, 6, 32
+; LE-NEXT:    sldi 7, 7, 32
+; LE-NEXT:    sldi 11, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    std 30, 720(1) # 8-byte Folded Spill
+; LE-NEXT:    lis 0, -3856
+; LE-NEXT:    oris 4, 4, 43690
+; LE-NEXT:    oris 5, 5, 21845
+; LE-NEXT:    lis 30, 3855
+; LE-NEXT:    oris 6, 6, 52428
+; LE-NEXT:    sldi 12, 10, 1
+; LE-NEXT:    rldicl 10, 10, 63, 1
+; LE-NEXT:    oris 7, 7, 13107
+; LE-NEXT:    std 27, 696(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 28, 4, 43690
+; LE-NEXT:    ori 29, 5, 21845
+; LE-NEXT:    std 14, 592(1) # 8-byte Folded Spill
+; LE-NEXT:    std 15, 600(1) # 8-byte Folded Spill
+; LE-NEXT:    sldi 4, 8, 1
+; LE-NEXT:    rldicl 5, 8, 63, 1
+; LE-NEXT:    std 16, 608(1) # 8-byte Folded Spill
+; LE-NEXT:    std 17, 616(1) # 8-byte Folded Spill
+; LE-NEXT:    sldi 8, 9, 1
+; LE-NEXT:    rldicl 9, 9, 63, 1
+; LE-NEXT:    std 28, 568(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 576(1) # 8-byte Folded Spill
+; LE-NEXT:    and 11, 11, 28
+; LE-NEXT:    and 3, 3, 29
+; LE-NEXT:    std 18, 624(1) # 8-byte Folded Spill
+; LE-NEXT:    std 19, 632(1) # 8-byte Folded Spill
+; LE-NEXT:    and 4, 4, 28
+; LE-NEXT:    and 5, 5, 29
+; LE-NEXT:    std 20, 640(1) # 8-byte Folded Spill
+; LE-NEXT:    std 21, 648(1) # 8-byte Folded Spill
+; LE-NEXT:    and 8, 8, 28
+; LE-NEXT:    and 9, 9, 29
+; LE-NEXT:    std 22, 656(1) # 8-byte Folded Spill
+; LE-NEXT:    std 23, 664(1) # 8-byte Folded Spill
+; LE-NEXT:    and 12, 12, 28
+; LE-NEXT:    and 10, 10, 29
+; LE-NEXT:    std 24, 672(1) # 8-byte Folded Spill
+; LE-NEXT:    std 25, 680(1) # 8-byte Folded Spill
+; LE-NEXT:    or 3, 3, 11
+; LE-NEXT:    or 4, 5, 4
+; LE-NEXT:    std 26, 688(1) # 8-byte Folded Spill
+; LE-NEXT:    std 31, 728(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 5, 0, 61680
+; LE-NEXT:    ori 11, 30, 3855
+; LE-NEXT:    std 2, 584(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 30, 6, 52428
+; LE-NEXT:    ori 0, 7, 13107
+; LE-NEXT:    std 30, 552(1) # 8-byte Folded Spill
+; LE-NEXT:    std 0, 560(1) # 8-byte Folded Spill
+; LE-NEXT:    or 6, 9, 8
+; LE-NEXT:    or 7, 10, 12
+; LE-NEXT:    sldi 8, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    sldi 9, 4, 2
+; LE-NEXT:    rldicl 4, 4, 62, 2
+; LE-NEXT:    sldi 5, 5, 32
+; LE-NEXT:    sldi 10, 11, 32
+; LE-NEXT:    sldi 11, 6, 2
+; LE-NEXT:    rldicl 6, 6, 62, 2
+; LE-NEXT:    sldi 12, 7, 2
+; LE-NEXT:    rldicl 7, 7, 62, 2
+; LE-NEXT:    and 8, 8, 30
+; LE-NEXT:    and 3, 3, 0
+; LE-NEXT:    and 9, 9, 30
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    oris 5, 5, 61680
+; LE-NEXT:    oris 10, 10, 3855
+; LE-NEXT:    and 11, 11, 30
+; LE-NEXT:    and 6, 6, 0
+; LE-NEXT:    and 12, 12, 30
+; LE-NEXT:    and 7, 7, 0
+; LE-NEXT:    or 3, 3, 8
+; LE-NEXT:    or 4, 4, 9
+; LE-NEXT:    ori 30, 5, 61680
+; LE-NEXT:    std 30, 536(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 0, 10, 3855
+; LE-NEXT:    std 0, 544(1) # 8-byte Folded Spill
+; LE-NEXT:    or 5, 6, 11
+; LE-NEXT:    or 6, 7, 12
+; LE-NEXT:    sldi 7, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    sldi 8, 4, 4
+; LE-NEXT:    rldicl 4, 4, 60, 4
+; LE-NEXT:    sldi 9, 5, 4
+; LE-NEXT:    rldicl 5, 5, 60, 4
+; LE-NEXT:    sldi 10, 6, 4
+; LE-NEXT:    rldicl 6, 6, 60, 4
+; LE-NEXT:    and 7, 7, 30
+; LE-NEXT:    and 3, 3, 0
+; LE-NEXT:    and 8, 8, 30
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 9, 9, 30
+; LE-NEXT:    and 5, 5, 0
+; LE-NEXT:    and 10, 10, 30
+; LE-NEXT:    and 6, 6, 0
+; LE-NEXT:    or 3, 3, 7
+; LE-NEXT:    or 4, 4, 8
+; LE-NEXT:    or 5, 5, 9
+; LE-NEXT:    or 6, 6, 10
+; LE-NEXT:    rldicl 7, 3, 32, 32
+; LE-NEXT:    rotlwi 8, 3, 24
+; LE-NEXT:    rldicl 9, 4, 32, 32
+; LE-NEXT:    rotlwi 10, 4, 24
+; LE-NEXT:    rldicl 11, 5, 32, 32
+; LE-NEXT:    rotlwi 12, 5, 24
+; LE-NEXT:    rotlwi 29, 7, 24
+; LE-NEXT:    rlwimi 8, 3, 8, 8, 15
+; LE-NEXT:    rotlwi 28, 9, 24
+; LE-NEXT:    rlwimi 10, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 8, 3, 8, 24, 31
+; LE-NEXT:    rlwimi 10, 4, 8, 24, 31
+; LE-NEXT:    rotlwi 4, 11, 24
+; LE-NEXT:    rlwimi 12, 5, 8, 8, 15
+; LE-NEXT:    rlwimi 29, 7, 8, 8, 15
+; LE-NEXT:    sldi 3, 8, 32
+; LE-NEXT:    rlwimi 28, 9, 8, 8, 15
+; LE-NEXT:    sldi 8, 10, 32
+; LE-NEXT:    rlwimi 12, 5, 8, 24, 31
+; LE-NEXT:    rlwimi 29, 7, 8, 24, 31
+; LE-NEXT:    rlwimi 28, 9, 8, 24, 31
+; LE-NEXT:    rlwimi 4, 11, 8, 8, 15
+; LE-NEXT:    sldi 5, 12, 32
+; LE-NEXT:    or 9, 3, 29
+; LE-NEXT:    or 3, 8, 28
+; LE-NEXT:    rlwimi 4, 11, 8, 24, 31
+; LE-NEXT:    or 10, 5, 4
+; LE-NEXT:    rlwinm 4, 3, 0, 30, 30
+; LE-NEXT:    std 4, 528(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 5, 5
+; LE-NEXT:    std 4, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 4, 4
+; LE-NEXT:    std 4, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 3, 3
+; LE-NEXT:    std 4, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 2, 2
+; LE-NEXT:    std 4, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 1, 1
+; LE-NEXT:    std 4, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 0, 0
+; LE-NEXT:    std 4, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 31, 33
+; LE-NEXT:    std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 30, 34
+; LE-NEXT:    std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 29, 35
+; LE-NEXT:    std 4, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 28, 36
+; LE-NEXT:    std 4, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 27, 37
+; LE-NEXT:    std 4, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 26, 38
+; LE-NEXT:    std 4, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 25, 39
+; LE-NEXT:    std 4, 216(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 24, 40
+; LE-NEXT:    rldicl 0, 6, 32, 32
+; LE-NEXT:    rotlwi 30, 6, 24
+; LE-NEXT:    rotlwi 27, 0, 24
+; LE-NEXT:    std 4, 208(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 23, 41
+; LE-NEXT:    rlwimi 30, 6, 8, 8, 15
+; LE-NEXT:    rlwimi 30, 6, 8, 24, 31
+; LE-NEXT:    rlwimi 27, 0, 8, 8, 15
+; LE-NEXT:    std 4, 200(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 22, 42
+; LE-NEXT:    sldi 6, 30, 32
+; LE-NEXT:    rlwimi 27, 0, 8, 24, 31
+; LE-NEXT:    or 11, 6, 27
+; LE-NEXT:    std 4, 192(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 21, 43
+; LE-NEXT:    clrldi 5, 3, 63
+; LE-NEXT:    rlwinm 6, 3, 0, 29, 29
+; LE-NEXT:    rlwinm 7, 3, 0, 28, 28
+; LE-NEXT:    std 4, 184(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 20, 44
+; LE-NEXT:    rlwinm 8, 3, 0, 27, 27
+; LE-NEXT:    rlwinm 12, 3, 0, 26, 26
+; LE-NEXT:    rlwinm 0, 3, 0, 25, 25
+; LE-NEXT:    std 4, 176(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 19, 45
+; LE-NEXT:    rlwinm 30, 3, 0, 24, 24
+; LE-NEXT:    rlwinm 29, 3, 0, 23, 23
+; LE-NEXT:    rlwinm 28, 3, 0, 22, 22
+; LE-NEXT:    std 4, 168(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 18, 46
+; LE-NEXT:    rlwinm 27, 3, 0, 21, 21
+; LE-NEXT:    rlwinm 26, 3, 0, 20, 20
+; LE-NEXT:    rlwinm 25, 3, 0, 19, 19
+; LE-NEXT:    std 4, 160(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 17, 47
+; LE-NEXT:    rlwinm 24, 3, 0, 18, 18
+; LE-NEXT:    rlwinm 23, 3, 0, 17, 17
+; LE-NEXT:    rlwinm 22, 3, 0, 16, 16
+; LE-NEXT:    std 4, 152(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 16, 48
+; LE-NEXT:    rlwinm 21, 3, 0, 15, 15
+; LE-NEXT:    rlwinm 20, 3, 0, 14, 14
+; LE-NEXT:    rlwinm 19, 3, 0, 13, 13
+; LE-NEXT:    std 4, 144(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 15, 49
+; LE-NEXT:    rlwinm 18, 3, 0, 12, 12
+; LE-NEXT:    rlwinm 17, 3, 0, 11, 11
+; LE-NEXT:    rlwinm 16, 3, 0, 10, 10
+; LE-NEXT:    std 4, 136(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 14, 50
+; LE-NEXT:    rlwinm 15, 3, 0, 9, 9
+; LE-NEXT:    rlwinm 14, 3, 0, 8, 8
+; LE-NEXT:    rlwinm 31, 3, 0, 7, 7
+; LE-NEXT:    std 4, 128(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 13, 51
+; LE-NEXT:    rlwinm 2, 3, 0, 6, 6
+; LE-NEXT:    std 4, 120(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 12, 52
+; LE-NEXT:    std 4, 112(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 11, 53
+; LE-NEXT:    std 4, 104(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 10, 54
+; LE-NEXT:    std 4, 96(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 9, 55
+; LE-NEXT:    std 4, 88(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 8, 56
+; LE-NEXT:    std 4, 80(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 7, 57
+; LE-NEXT:    std 4, 72(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 6, 58
+; LE-NEXT:    std 4, 64(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 5, 59
+; LE-NEXT:    std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 4, 60
+; LE-NEXT:    std 4, 48(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 3, 61
+; LE-NEXT:    rldicl 3, 3, 2, 62
+; LE-NEXT:    std 3, 32(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 528(1) # 8-byte Folded Reload
+; LE-NEXT:    std 4, 40(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 288(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 5
+; LE-NEXT:    std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 6
+; LE-NEXT:    std 3, 296(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 7
+; LE-NEXT:    std 3, 304(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 8
+; LE-NEXT:    std 3, 312(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 12
+; LE-NEXT:    std 3, 320(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 0
+; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 30
+; LE-NEXT:    std 3, 528(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 29
+; LE-NEXT:    std 3, 520(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 28
+; LE-NEXT:    std 3, 512(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 27
+; LE-NEXT:    std 3, 504(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 26
+; LE-NEXT:    std 3, 496(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 25
+; LE-NEXT:    std 3, 488(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 24
+; LE-NEXT:    std 3, 480(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 23
+; LE-NEXT:    std 3, 472(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 22
+; LE-NEXT:    std 3, 464(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 21
+; LE-NEXT:    std 3, 456(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 20
+; LE-NEXT:    std 3, 448(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 19
+; LE-NEXT:    std 3, 440(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 18
+; LE-NEXT:    std 3, 432(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 17
+; LE-NEXT:    std 3, 424(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 16
+; LE-NEXT:    std 3, 416(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 15
+; LE-NEXT:    std 3, 408(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 14
+; LE-NEXT:    std 3, 400(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 31
+; LE-NEXT:    std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 2
+; LE-NEXT:    std 3, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 4, 3, 32, 31
+; LE-NEXT:    ld 3, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 4, 11, 4
+; LE-NEXT:    rldicl 5, 3, 33, 30
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 6, 3, 34, 29
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 4, 11, 5
+; LE-NEXT:    ld 5, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 7, 3, 35, 28
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 8, 3, 36, 27
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 4, 11, 6
+; LE-NEXT:    mulld 6, 11, 7
+; LE-NEXT:    mulld 7, 11, 8
+; LE-NEXT:    rldicl 12, 3, 37, 26
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 8, 11, 12
+; LE-NEXT:    std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    clrldi 4, 9, 63
+; LE-NEXT:    rldicl 0, 3, 38, 25
+; LE-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    mulld 12, 11, 0
+; LE-NEXT:    rldicl 30, 3, 39, 24
+; LE-NEXT:    ld 3, 208(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 0, 11, 30
+; LE-NEXT:    rldicl 29, 3, 40, 23
+; LE-NEXT:    ld 3, 200(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 30, 11, 29
+; LE-NEXT:    rldicl 28, 3, 41, 22
+; LE-NEXT:    ld 3, 192(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 29, 11, 28
+; LE-NEXT:    rldicl 27, 3, 42, 21
+; LE-NEXT:    ld 3, 184(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 28, 11, 27
+; LE-NEXT:    rldicl 26, 3, 43, 20
+; LE-NEXT:    ld 3, 176(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 27, 11, 26
+; LE-NEXT:    rldicl 25, 3, 44, 19
+; LE-NEXT:    ld 3, 168(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 26, 11, 25
+; LE-NEXT:    rldicl 24, 3, 45, 18
+; LE-NEXT:    ld 3, 160(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 25, 11, 24
+; LE-NEXT:    rldicl 23, 3, 46, 17
+; LE-NEXT:    ld 3, 152(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 24, 11, 23
+; LE-NEXT:    rldicl 22, 3, 47, 16
+; LE-NEXT:    ld 3, 144(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 23, 11, 22
+; LE-NEXT:    rldicl 21, 3, 48, 15
+; LE-NEXT:    ld 3, 136(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 22, 11, 21
+; LE-NEXT:    rldicl 20, 3, 49, 14
+; LE-NEXT:    ld 3, 128(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 21, 11, 20
+; LE-NEXT:    rldicl 19, 3, 50, 13
+; LE-NEXT:    ld 3, 120(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 20, 11, 19
+; LE-NEXT:    rldicl 18, 3, 51, 12
+; LE-NEXT:    ld 3, 112(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 19, 11, 18
+; LE-NEXT:    rldicl 17, 3, 52, 11
+; LE-NEXT:    ld 3, 104(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 18, 11, 17
+; LE-NEXT:    rldicl 16, 3, 53, 10
+; LE-NEXT:    ld 3, 96(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 17, 11, 16
+; LE-NEXT:    rldicl 15, 3, 54, 9
+; LE-NEXT:    ld 3, 88(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 16, 11, 15
+; LE-NEXT:    rldicl 14, 3, 55, 8
+; LE-NEXT:    ld 3, 80(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 15, 11, 14
+; LE-NEXT:    rldicl 31, 3, 56, 7
+; LE-NEXT:    ld 3, 72(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 14, 11, 31
+; LE-NEXT:    rldicl 2, 3, 57, 6
+; LE-NEXT:    ld 3, 64(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 31, 11, 2
+; LE-NEXT:    rldicl 3, 3, 58, 5
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 56(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 59, 4
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 48(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 60, 3
+; LE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 61, 2
+; LE-NEXT:    std 3, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 32(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 62, 1
+; LE-NEXT:    std 3, 216(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 2, 11, 3
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 11, 11, 3
+; LE-NEXT:    rlwinm 3, 9, 0, 30, 30
+; LE-NEXT:    mulld 3, 10, 3
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 288(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 5, 4
+; LE-NEXT:    rlwinm 5, 9, 0, 29, 29
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 296(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 28, 28
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 304(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 27, 27
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 312(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 26, 26
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 320(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 25, 25
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 24, 24
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 528(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 520(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 512(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 504(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 496(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 488(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 480(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 472(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 464(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 456(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 448(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 440(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 432(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 424(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 416(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 408(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 400(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 392(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 6
+; LE-NEXT:    ld 6, 576(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 7
+; LE-NEXT:    ld 7, 568(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 8
+; LE-NEXT:    ld 8, 560(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 12
+; LE-NEXT:    ld 12, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 0
+; LE-NEXT:    ld 0, 536(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 30
+; LE-NEXT:    ld 30, 720(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 29
+; LE-NEXT:    ld 29, 712(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 28
+; LE-NEXT:    ld 28, 704(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 27
+; LE-NEXT:    ld 27, 696(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 26
+; LE-NEXT:    ld 26, 688(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 25
+; LE-NEXT:    ld 25, 680(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 24
+; LE-NEXT:    ld 24, 672(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 23
+; LE-NEXT:    ld 23, 664(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 22
+; LE-NEXT:    ld 22, 656(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 21
+; LE-NEXT:    ld 21, 648(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 20
+; LE-NEXT:    ld 20, 640(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 19
+; LE-NEXT:    ld 19, 632(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 18
+; LE-NEXT:    ld 18, 624(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 17
+; LE-NEXT:    ld 17, 616(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 16
+; LE-NEXT:    ld 16, 608(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 15
+; LE-NEXT:    ld 15, 600(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 14
+; LE-NEXT:    ld 14, 592(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 31
+; LE-NEXT:    ld 31, 728(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 2
+; LE-NEXT:    ld 2, 584(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    xor 3, 3, 11
+; LE-NEXT:    ld 11, 552(1) # 8-byte Folded Reload
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rotlwi 5, 3, 24
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    rlwimi 5, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 5, 3, 8, 24, 31
+; LE-NEXT:    rotlwi 3, 4, 24
+; LE-NEXT:    rlwimi 3, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 3, 4, 8, 24, 31
+; LE-NEXT:    sldi 4, 5, 32
+; LE-NEXT:    or 3, 4, 3
+; LE-NEXT:    ld 4, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    mtfprd 0, 3
+; LE-NEXT:    rlwinm 3, 9, 0, 23, 23
+; LE-NEXT:    mulld 3, 10, 3
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    rlwinm 4, 9, 0, 22, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 21, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 20, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 19, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 18, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 17, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 16, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 15, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 14, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 13, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 12, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 11, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 10, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 9, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 8, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 7, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 6, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 5, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 4, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 3, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 2, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 1, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 32, 32
+; LE-NEXT:    rldicl 4, 4, 32, 31
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 31, 33
+; LE-NEXT:    rldicl 4, 4, 33, 30
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 30, 34
+; LE-NEXT:    rldicl 4, 4, 34, 29
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 29, 35
+; LE-NEXT:    rldicl 4, 4, 35, 28
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 28, 36
+; LE-NEXT:    rldicl 4, 4, 36, 27
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 27, 37
+; LE-NEXT:    rldicl 4, 4, 37, 26
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 26, 38
+; LE-NEXT:    rldicl 4, 4, 38, 25
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 25, 39
+; LE-NEXT:    rldicl 4, 4, 39, 24
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 24, 40
+; LE-NEXT:    rldicl 4, 4, 40, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 23, 41
+; LE-NEXT:    rldicl 4, 4, 41, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 22, 42
+; LE-NEXT:    rldicl 4, 4, 42, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 21, 43
+; LE-NEXT:    rldicl 4, 4, 43, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 20, 44
+; LE-NEXT:    rldicl 4, 4, 44, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 19, 45
+; LE-NEXT:    rldicl 4, 4, 45, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 18, 46
+; LE-NEXT:    rldicl 4, 4, 46, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 17, 47
+; LE-NEXT:    rldicl 4, 4, 47, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 16, 48
+; LE-NEXT:    rldicl 4, 4, 48, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 15, 49
+; LE-NEXT:    rldicl 4, 4, 49, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 14, 50
+; LE-NEXT:    rldicl 4, 4, 50, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 13, 51
+; LE-NEXT:    rldicl 4, 4, 51, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 12, 52
+; LE-NEXT:    rldicl 4, 4, 52, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 11, 53
+; LE-NEXT:    rldicl 4, 4, 53, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 10, 54
+; LE-NEXT:    rldicl 4, 4, 54, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 9, 55
+; LE-NEXT:    rldicl 4, 4, 55, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 8, 56
+; LE-NEXT:    rldicl 4, 4, 56, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 7, 57
+; LE-NEXT:    rldicl 4, 4, 57, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 6, 58
+; LE-NEXT:    rldicl 4, 4, 58, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 5, 59
+; LE-NEXT:    rldicl 4, 4, 59, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 4, 60
+; LE-NEXT:    rldicl 4, 4, 60, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 3, 61
+; LE-NEXT:    rldicl 4, 4, 61, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 2, 62
+; LE-NEXT:    rldicl 4, 4, 62, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    rotlwi 5, 4, 24
+; LE-NEXT:    rlwimi 5, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 5, 4, 8, 24, 31
+; LE-NEXT:    rotlwi 4, 3, 24
+; LE-NEXT:    rlwimi 4, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 4, 3, 8, 24, 31
+; LE-NEXT:    sldi 3, 4, 32
+; LE-NEXT:    or 3, 3, 5
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    mtfprd 1, 3
+; LE-NEXT:    xxmrghd 34, 1, 0
+; LE-NEXT:    addi 1, 1, 736
+; LE-NEXT:    blr
+  %a.ext = zext <2 x i64> %a to <2 x i128>
+  %b.ext = zext <2 x i64> %b to <2 x i128>
+  %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+  %res.ext = lshr <2 x i128> %clmul, splat (i128 64)
+  %res = trunc <2 x i128> %res.ext to <2 x i64>
+  ret <2 x i64> %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index 92b176b7a4bbb..b4dd00125aab5 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -797,17 +797,13 @@ define i1 @pow2_and_fail0(i32 %x, i32 %y) {
   ret i1 %r
 }
 
-define i1 @pow2_and_fail1(i32 %x, i32 %y) {
-; CHECK-LABEL: pow2_and_fail1:
+define i1 @pow2_andnot_3op(i32 %x, i32 %y) {
+; CHECK-LABEL: pow2_andnot_3op:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    notl %edi
-; CHECK-NEXT:    andl %eax, %edi
-; CHECK-NEXT:    testl $-2, %edi
-; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    andl $-2, %edi
+; CHECK-NEXT:    btl %esi, %edi
+; CHECK-NEXT:    setae %al
 ; CHECK-NEXT:    retq
   %yy = shl i32 1, %y
   %nyy = sub i32 1, %yy
@@ -817,17 +813,13 @@ define i1 @pow2_and_fail1(i32 %x, i32 %y) {
   ret i1 %r
 }
 
-define i1 @pow2_and_fail2(i32 %x, i32 %y, i32 %z) {
-; CHECK-LABEL: pow2_and_fail2:
+define i1 @pow2_and_3op(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: pow2_and_3op:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    shll %cl, %eax
-; CHECK-NEXT:    andl %edx, %eax
 ; CHECK-NEXT:    notl %edi
-; CHECK-NEXT:    testl %edi, %eax
-; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    btl %esi, %edi
+; CHECK-NEXT:    setae %al
 ; CHECK-NEXT:    retq
   %yy = shl i32 1, %y
   %d = and i32 %yy, %z
@@ -856,13 +848,9 @@ define i1 @pow2_though_zext(i32 %x, i16 %y) {
 define i1 @pow2_and_i20(i20 %num, i20 %shift) {
 ; CHECK-LABEL: pow2_and_i20:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    shll %cl, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    testl $1048575, %eax # imm = 0xFFFFF
-; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    andl $1048575, %edi # imm = 0xFFFFF
+; CHECK-NEXT:    btl %esi, %edi
+; CHECK-NEXT:    setae %al
 ; CHECK-NEXT:    retq
   %mask = shl nuw i20 1, %shift
   %bit = and i20 %mask, %num
@@ -873,13 +861,10 @@ define i1 @pow2_and_i20(i20 %num, i20 %shift) {
 define i1 @pow2_and_i50(i50 %num, i50 %shift) {
 ; CHECK-LABEL: pow2_and_i50:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rax
+; CHECK-NEXT:    movabsq $1125899906842623, %rax # imm = 0x3FFFFFFFFFFFF
 ; CHECK-NEXT:    andq %rdi, %rax
-; CHECK-NEXT:    shlq $14, %rax
-; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    btq %rsi, %rax
+; CHECK-NEXT:    setae %al
 ; CHECK-NEXT:    retq
   %mask = shl nuw i50 1, %shift
   %bit = and i50 %mask, %num
diff --git a/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s
new file mode 100644
index 0000000000000..abdb344ac0614
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s
@@ -0,0 +1,1529 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 %s 2>&1 | FileCheck --check-prefix=GFX1170-ERR --implicit-check-not=error: %s
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0, v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s
new file mode 100644
index 0000000000000..6b1b889f8bedd
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s
@@ -0,0 +1,1529 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX1170-ERR --implicit-check-not=error: %s
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], s[0:1], v[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], 1.0, v[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx1170_unsupported.s b/llvm/test/MC/AMDGPU/gfx1170_unsupported.s
new file mode 100644
index 0000000000000..b4e0da1779ffb
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1170_unsupported.s
@@ -0,0 +1,11 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 %s 2>&1 | FileCheck --implicit-check-not=error: %s
+
+//===----------------------------------------------------------------------===//
+// Unsupported instructions.
+//===----------------------------------------------------------------------===//
+
+v_dot2c_f32_f16 v0, v1, v2
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_dot2acc_f32_f16 v5, v1, v2
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s
index 363db1a16b170..a96e9c4c07873 100644
--- a/llvm/test/MC/AMDGPU/literals.s
+++ b/llvm/test/MC/AMDGPU/literals.s
@@ -206,14 +206,14 @@ v_fract_f64_e32 v[0:1], lit(1.0)
 v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0
 // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b]
 // NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 // NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0)
 // NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
 // NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 // NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
@@ -658,14 +658,14 @@ v_fract_f64_e32 v[0:1], 0xffffffffffffffff
 v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1
 // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
 // NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 // NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1)
 // NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
 // NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 // NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt
new file mode 100644
index 0000000000000..1e778fb04aea2
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt
@@ -0,0 +1,1628 @@
+# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s
+# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX1170-ERR %s
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18]
+
+[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18]
+
+[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18]
+
+[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18]
+
+[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18]
+
+
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18]
+
+[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18]
+
+
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt
new file mode 100644
index 0000000000000..169fd20488e37
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt
@@ -0,0 +1,1628 @@
+# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s
+# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX1170-ERR %s
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18]
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-type-constant-folding.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-type-constant-folding.ll
new file mode 100644
index 0000000000000..5b12fd3ec545f
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-type-constant-folding.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+;
+; Test type mismatch in ConstantFolding for vector types.
+
+define internal void @f() {
+; CHECK-LABEL: define internal void @f() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:    store <4 x i16> <i16 sub (i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 0), i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 0)), i16 sub (i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 1), i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 1)), i16 sub (i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 2), i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 2)), i16 sub (i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 3), i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 3))>, ptr @f, align 8
+; CHECK-NEXT:    ret void
+;
+  %1 = ptrtoint ptr @f to i64
+  %2 = bitcast i64 %1 to <4 x i16>
+  %3 = ptrtoint ptr @f to i64
+  %4 = bitcast i64 %3 to <4 x i16>
+  %sub = sub <4 x i16> %2, %4
+  store <4 x i16> %sub, ptr @f, align 8
+  ret void
+}
diff --git a/llvm/unittests/Target/X86/CMakeLists.txt b/llvm/unittests/Target/X86/CMakeLists.txt
index b011681aa3b95..253ac5db96df2 100644
--- a/llvm/unittests/Target/X86/CMakeLists.txt
+++ b/llvm/unittests/Target/X86/CMakeLists.txt
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
   MC
   MIRParser
   Passes
+  SelectionDAG
   Support
   Target
   TargetParser
@@ -24,4 +25,5 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_unittest(X86Tests
   MachineSizeOptsTest.cpp
   TernlogTest.cpp
+  X86SelectionDAGTest.cpp
   )
diff --git a/llvm/unittests/Target/X86/X86SelectionDAGTest.cpp b/llvm/unittests/Target/X86/X86SelectionDAGTest.cpp
new file mode 100644
index 0000000000000..b546908a48931
--- /dev/null
+++ b/llvm/unittests/Target/X86/X86SelectionDAGTest.cpp
@@ -0,0 +1,121 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+
+/// Fixture that builds a minimal x86-64 MachineFunction and an initialized
+/// SelectionDAG so tests can create X86ISD nodes and query them directly.
+class X86SelectionDAGTest : public testing::Test {
+protected:
+  const TargetSubtargetInfo *STI = nullptr;
+
+  /// Registers the X86 target once for the whole test suite.
+  static void SetUpTestCase() {
+    LLVMInitializeX86TargetInfo();
+    LLVMInitializeX86Target();
+    LLVMInitializeX86TargetMC();
+  }
+
+  /// Creates the target machine, parses a trivial module, and initializes the
+  /// DAG for its single function @f.
+  void SetUp() override {
+    StringRef Assembly = "define void @f() { ret void }";
+
+    Triple TargetTriple("x86_64--");
+    std::string Error;
+    const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
+    // lookupTarget returns null on failure; report it instead of dereferencing.
+    if (!T)
+      report_fatal_error(StringRef(Error));
+
+    TargetOptions Options;
+    TM = std::unique_ptr<TargetMachine>(T->createTargetMachine(
+        TargetTriple, "x86-64-v4", "", Options, std::nullopt, std::nullopt,
+        CodeGenOptLevel::Aggressive));
+
+    SMDiagnostic SMError;
+    M = parseAssemblyString(Assembly, SMError, Context);
+    if (!M)
+      report_fatal_error(SMError.getMessage());
+    M->setDataLayout(TM->createDataLayout());
+
+    F = M->getFunction("f");
+    if (!F)
+      report_fatal_error("F?");
+
+    // MF and DAG are handed references to the module info and the remark
+    // emitter and are used by the test body after SetUp() returns, so both
+    // objects are owned by the fixture rather than being SetUp() locals.
+    MMI = std::make_unique<MachineModuleInfo>(TM.get());
+    ORE = std::make_unique<OptimizationRemarkEmitter>(F);
+
+    STI = TM->getSubtargetImpl(*F);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *STI, MMI->getContext(), 0);
+
+    DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
+    DAG->init(*MF, *ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+              *MMI, nullptr);
+  }
+
+  // Declaration order matters: members are destroyed in reverse order, so DAG
+  // and MF go away before the MMI/ORE objects they were initialized with.
+  LLVMContext Context;
+  std::unique_ptr<TargetMachine> TM;
+  std::unique_ptr<Module> M;
+  Function *F = nullptr;
+  std::unique_ptr<MachineModuleInfo> MMI;
+  std::unique_ptr<OptimizationRemarkEmitter> ORE;
+  std::unique_ptr<MachineFunction> MF;
+  std::unique_ptr<SelectionDAG> DAG;
+};
+
+// FANDN computes (~Op0 & Op1) on the raw bits, so using a sign-bit constant as
+// Op0 must yield a value whose sign bit is known to be zero.
+TEST_F(X86SelectionDAGTest, computeKnownBits_FANDN) {
+  SDLoc Loc;
+
+  // Scalar: clearing the sign bit makes the result known non-negative.
+  auto SrcF32 = DAG->getCopyFromReg(DAG->getEntryNode(), Loc, 1, MVT::f32);
+  auto SignBitF32 = DAG->getConstantFP(-0.0f, Loc, MVT::f32);
+  auto OpF32 = DAG->getNode(X86ISD::FANDN, Loc, MVT::f32, SignBitF32, SrcF32);
+  KnownBits KnownF32 = DAG->computeKnownBits(OpF32);
+  EXPECT_TRUE(KnownF32.isNonNegative());
+
+  // Vector: only the high lane has its sign bit cleared, so only a query that
+  // demands just that lane (mask 0b10) may report a non-negative result.
+  auto Src2xF64 = DAG->getCopyFromReg(DAG->getEntryNode(), Loc, 1, MVT::v2f64);
+  auto ZeroF64 = DAG->getConstantFP(+0.0f, Loc, MVT::f64);
+  auto SignBitF64 = DAG->getConstantFP(-0.0f, Loc, MVT::f64);
+  auto HiSign2xF64 =
+      DAG->getBuildVector(MVT::v2f64, Loc, {ZeroF64, SignBitF64});
+  auto Op2xF64 =
+      DAG->getNode(X86ISD::FANDN, Loc, MVT::v2f64, HiSign2xF64, Src2xF64);
+  KnownBits KnownAll2xF64 = DAG->computeKnownBits(Op2xF64);
+  KnownBits KnownLo2xF64 = DAG->computeKnownBits(Op2xF64, APInt(2, 1));
+  KnownBits KnownHi2xF64 = DAG->computeKnownBits(Op2xF64, APInt(2, 2));
+  EXPECT_FALSE(KnownAll2xF64.isNonNegative());
+  EXPECT_FALSE(KnownLo2xF64.isNonNegative());
+  EXPECT_TRUE(KnownHi2xF64.isNonNegative());
+}
+
+} // end namespace llvm
diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index 9724493642d75..049ce41ba45ef 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -381,7 +381,7 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
         OutStr += ", ";
         OutStr += ColInstr->getName();
       } else {
-        OutStr += ", (uint32_t)-1U";
+        OutStr += ", INSTRUCTION_LIST_END";
       }
     }
 
@@ -455,7 +455,7 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) {
       OS << ")\n";
       OS << "    return Table[mid][" << I + 1 << "];\n";
     }
-    OS << "  return (uint32_t)-1U;";
+    OS << "  llvm_unreachable(\"Unrecognized column value!\");\n";
   } else {
     OS << "  return Table[mid][1];\n";
   }
@@ -474,7 +474,7 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
   const ListInit *ColFields = InstrMapDesc.getColFields();
   ArrayRef<const ListInit *> ValueCols = InstrMapDesc.getValueCols();
   OS << "// " << InstrMapDesc.getName() << "\nLLVM_READONLY\n";
-  OS << "int64_t " << InstrMapDesc.getName() << "(uint32_t Opcode";
+  OS << "int32_t " << InstrMapDesc.getName() << "(uint32_t Opcode";
   if (ValueCols.size() > 1) {
     for (const Init *CF : ColFields->getElements()) {
       std::string ColName = CF->getAsUnquotedString();
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/X86/BUILD.gn
index af2c6d38d9519..e70ccd67b4018 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Target/X86/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Target/X86/BUILD.gn
@@ -21,5 +21,6 @@ unittest("X86Tests") {
   sources = [
     "MachineSizeOptsTest.cpp",
     "TernlogTest.cpp",
+	"X86SelectionDAGTest.cpp",
   ]
 }
diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
index 5d2429bb476e6..9af0f301d763c 100644
--- a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
+++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
@@ -157,6 +157,21 @@ void resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
                                        ValueRange indices,
                                        SmallVectorImpl<Value> &sourceIndices);
 
+/// Given the 'indices' of a load/store operation where the memref is a result
+/// of a rank-reducing full subview op, returns the indices w.r.t. the source
+/// memref of the memref.subview op. For example:
+///
+///  %alias = memref.subview %src[0, 0, 0][1, 2, 2][1, 1, 1]: memref<1x2x2xf32>
+///                           to memref<2x2xf32>
+///  %val = memref.load %alias[%i, %j] : memref<2x2xf32>
+///
+/// could be folded into
+///
+///  %val = memref.load %src[0, %i, %j] : memref<1x2x2xf32>
+LogicalResult resolveSourceIndicesRankReducingSubview(
+    Location loc, OpBuilder &b, memref::SubViewOp subViewOp, ValueRange indices,
+    SmallVectorImpl<Value> &sourceIndices);
+
 } // namespace memref
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
index c85f3b02c4a44..a758032ef69b4 100644
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -18,6 +18,7 @@
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
 #include <optional>
+#include <tuple>
 
 namespace mlir {
 class Location;
@@ -248,6 +249,12 @@ FailureOr<scf::ParallelOp> parallelLoopUnrollByFactors(
     function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
     IRMapping *clonedToSrcOpsMap = nullptr);
 
+/// Get constant loop bounds and steps for each of the induction variables of
+/// the given loop operation, if all the loop's ranges are constant. Each entry
+/// in the returned vector is a tuple (lowerBound, upperBound, step).
+llvm::SmallVector<std::tuple<int64_t, int64_t, int64_t>>
+getConstLoopBounds(mlir::LoopLikeOpInterface loopOp);
+
 /// Get constant trip counts for each of the induction variables of the given
 /// loop operation. If any of the loop's trip counts is not constant, return an
 /// empty vector.
diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
index 610ce1f13c56b..78f0fe1392962 100644
--- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
+++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
@@ -449,8 +449,14 @@ struct RoundOpPattern final : public OpConversionPattern<math::RoundOp> {
       return res;
 
     Location loc = roundOp.getLoc();
-    Value operand = roundOp.getOperand();
-    Type ty = operand.getType();
+    auto ty = getTypeConverter()->convertType(adaptor.getOperand().getType());
+    if (!ty) {
+      return rewriter.notifyMatchFailure(
+          roundOp->getLoc(),
+          llvm::formatv("failed to convert type {0} for SPIR-V",
+                        roundOp.getType()));
+    }
+
     Type ety = getElementTypeOrSelf(ty);
 
     auto zero = spirv::ConstantOp::getZero(ty, loc, rewriter);
@@ -466,14 +472,15 @@ struct RoundOpPattern final : public OpConversionPattern<math::RoundOp> {
                                        rewriter.getFloatAttr(ety, 0.5));
     }
 
-    auto abs = spirv::GLFAbsOp::create(rewriter, loc, operand);
+    auto abs = spirv::GLFAbsOp::create(rewriter, loc, adaptor.getOperand());
     auto floor = spirv::GLFloorOp::create(rewriter, loc, abs);
     auto sub = spirv::FSubOp::create(rewriter, loc, abs, floor);
     auto greater =
         spirv::FOrdGreaterThanEqualOp::create(rewriter, loc, sub, half);
     auto select = spirv::SelectOp::create(rewriter, loc, greater, one, zero);
     auto add = spirv::FAddOp::create(rewriter, loc, floor, select);
-    rewriter.replaceOpWithNewOp<math::CopySignOp>(roundOp, add, operand);
+    rewriter.replaceOpWithNewOp<math::CopySignOp>(roundOp, add,
+                                                  adaptor.getOperand());
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 4c67720654f83..d960201e2b3d0 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -4151,11 +4151,13 @@ LLVMFuncOp BlockAddressOp::getFunction(SymbolTableCollection &symbolTable) {
 }
 
 BlockTagOp BlockAddressOp::getBlockTagOp() {
-  auto funcOp = dyn_cast<LLVMFuncOp>(mlir::SymbolTable::lookupNearestSymbolFrom(
-      parentLLVMModule(*this), getBlockAddr().getFunction()));
+  Operation *sym = mlir::SymbolTable::lookupNearestSymbolFrom(
+      parentLLVMModule(*this), getBlockAddr().getFunction());
+  if (!sym)
+    return nullptr;
+  auto funcOp = dyn_cast<LLVMFuncOp>(sym);
   if (!funcOp)
     return nullptr;
-
   BlockTagOp blockTagOp = nullptr;
   funcOp.walk([&](LLVM::BlockTagOp labelOp) {
     if (labelOp.getTag() == getBlockAddr().getTag()) {
diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
index 2d341dce665e5..cf126cd85ddce 100644
--- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -286,5 +286,46 @@ void resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
   }
 }
 
+LogicalResult resolveSourceIndicesRankReducingSubview(
+    Location loc, OpBuilder &b, memref::SubViewOp subViewOp, ValueRange indices,
+    SmallVectorImpl<Value> &sourceIndices) {
+  if (!subViewOp.hasZeroOffset() || !subViewOp.hasUnitStride())
+    return failure();
+
+  MemRefType srcType = subViewOp.getSourceType();
+  MemRefType resType = subViewOp.getType();
+  unsigned srcRank = srcType.getRank();
+  unsigned resRank = resType.getRank();
+  if (srcRank <= resRank || indices.size() != resRank)
+    return failure();
+
+  auto droppedDims = subViewOp.getDroppedDims();
+  if (droppedDims.none() || droppedDims.count() != srcRank - resRank)
+    return failure();
+
+  auto mixedSizes = subViewOp.getMixedSizes();
+  if (mixedSizes.size() != srcRank)
+    return failure();
+
+  unsigned resultDim = 0;
+  for (unsigned sourceDim = 0; sourceDim < srcRank; ++sourceDim) {
+    if (droppedDims.test(sourceDim)) {
+      auto sizeCst = getConstantIntValue(mixedSizes[sourceDim]);
+      if (!sizeCst || *sizeCst != 1)
+        return failure();
+      sourceIndices.push_back(
+          getValueOrCreateConstantIndexOp(b, loc, b.getIndexAttr(0)));
+      continue;
+    }
+    if (resultDim >= indices.size())
+      return failure();
+    sourceIndices.push_back(indices[resultDim++]);
+  }
+  if (resultDim != indices.size())
+    return failure();
+
+  return success();
+}
+
 } // namespace memref
 } // namespace mlir
diff --git a/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
index 2bd41d99a3661..2ba1778532860 100644
--- a/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIROpenACCDialect
   OpenACC.cpp
+  OpenACCCG.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenACC
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 03fe5d177e327..ce024648b160c 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -465,16 +465,6 @@ ValueRange SerialOp::getSuccessorInputs(RegionSuccessor successor) {
   return getSingleRegionSuccessorInputs(getOperation(), successor);
 }
 
-void KernelEnvironmentOp::getSuccessorRegions(
-    RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
-  getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
-                                    regions);
-}
-
-ValueRange KernelEnvironmentOp::getSuccessorInputs(RegionSuccessor successor) {
-  return getSingleRegionSuccessorInputs(getOperation(), successor);
-}
-
 void DataOp::getSuccessorRegions(RegionBranchPoint point,
                                  SmallVectorImpl<RegionSuccessor> &regions) {
   getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
@@ -876,20 +866,6 @@ LogicalResult acc::FirstprivateOp::verify() {
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// FirstprivateMapInitialOp
-//===----------------------------------------------------------------------===//
-LogicalResult acc::FirstprivateMapInitialOp::verify() {
-  if (getDataClause() != acc::DataClause::acc_firstprivate)
-    return emitError("data clause associated with firstprivate operation must "
-                     "match its intent");
-  if (failed(checkVarAndVarType(*this)))
-    return failure();
-  if (failed(checkNoModifier(*this)))
-    return failure();
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // ReductionOp
 //===----------------------------------------------------------------------===//
@@ -1289,16 +1265,6 @@ void acc::FirstprivateOp::getEffects(
   addResultEffect<MemoryEffects::Write>(effects, getAccVar());
 }
 
-// FirstprivateMapInitialOp: var read, accVar result write.
-void acc::FirstprivateMapInitialOp::getEffects(
-    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
-        &effects) {
-  effects.emplace_back(MemoryEffects::Read::get(),
-                       acc::CurrentDeviceIdResource::get());
-  addOperandEffect<MemoryEffects::Read>(effects, getVarMutable());
-  addResultEffect<MemoryEffects::Write>(effects, getAccVar());
-}
-
 // ReductionOp: var read, accVar result write.
 void acc::ReductionOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
@@ -1573,65 +1539,6 @@ struct RemoveConstantIfConditionWithRegion : public OpRewritePattern<OpTy> {
   }
 };
 
-/// Remove empty acc.kernel_environment operations. If the operation has wait
-/// operands, create a acc.wait operation to preserve synchronization.
-struct RemoveEmptyKernelEnvironment
-    : public OpRewritePattern<acc::KernelEnvironmentOp> {
-  using OpRewritePattern<acc::KernelEnvironmentOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(acc::KernelEnvironmentOp op,
-                                PatternRewriter &rewriter) const override {
-    assert(op->getNumRegions() == 1 && "expected op to have one region");
-
-    Block &block = op.getRegion().front();
-    if (!block.empty())
-      return failure();
-
-    // Conservatively disable canonicalization of empty acc.kernel_environment
-    // operations if the wait operands in the kernel_environment cannot be fully
-    // represented by acc.wait operation.
-
-    // Disable canonicalization if device type is not the default
-    if (auto deviceTypeAttr = op.getWaitOperandsDeviceTypeAttr()) {
-      for (auto attr : deviceTypeAttr) {
-        if (auto dtAttr = mlir::dyn_cast<acc::DeviceTypeAttr>(attr)) {
-          if (dtAttr.getValue() != mlir::acc::DeviceType::None)
-            return failure();
-        }
-      }
-    }
-
-    // Disable canonicalization if any wait segment has a devnum
-    if (auto hasDevnumAttr = op.getHasWaitDevnumAttr()) {
-      for (auto attr : hasDevnumAttr) {
-        if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(attr)) {
-          if (boolAttr.getValue())
-            return failure();
-        }
-      }
-    }
-
-    // Disable canonicalization if there are multiple wait segments
-    if (auto segmentsAttr = op.getWaitOperandsSegmentsAttr()) {
-      if (segmentsAttr.size() > 1)
-        return failure();
-    }
-
-    // Remove empty kernel environment.
-    // Preserve synchronization by creating acc.wait operation if needed.
-    if (!op.getWaitOperands().empty() || op.getWaitOnlyAttr())
-      rewriter.replaceOpWithNewOp<acc::WaitOp>(op, op.getWaitOperands(),
-                                               /*asyncOperand=*/Value(),
-                                               /*waitDevnum=*/Value(),
-                                               /*async=*/nullptr,
-                                               /*ifCond=*/Value());
-    else
-      rewriter.eraseOp(op);
-
-    return success();
-  }
-};
-
 //===----------------------------------------------------------------------===//
 // Recipe Region Helpers
 //===----------------------------------------------------------------------===//
@@ -3221,15 +3128,6 @@ void acc::HostDataOp::getCanonicalizationPatterns(RewritePatternSet &results,
   results.add<RemoveConstantIfConditionWithRegion<HostDataOp>>(context);
 }
 
-//===----------------------------------------------------------------------===//
-// KernelEnvironmentOp
-//===----------------------------------------------------------------------===//
-
-void acc::KernelEnvironmentOp::getCanonicalizationPatterns(
-    RewritePatternSet &results, MLIRContext *context) {
-  results.add<RemoveEmptyKernelEnvironment>(context);
-}
-
 //===----------------------------------------------------------------------===//
 // LoopOp
 //===----------------------------------------------------------------------===//
@@ -5129,23 +5027,6 @@ LogicalResult acc::WaitOp::verify() {
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// ReductionCombineOp
-//===----------------------------------------------------------------------===//
-void acc::ReductionCombineOp::getEffects(
-    llvm::SmallVectorImpl<
-        mlir::SideEffects::EffectInstance<mlir::MemoryEffects::Effect>>
-        &effects) {
-  effects.emplace_back(mlir::MemoryEffects::Read::get(), &getSrcMemrefMutable(),
-                       mlir::SideEffects::DefaultResource::get());
-  effects.emplace_back(mlir::MemoryEffects::Read::get(),
-                       &getDestMemrefMutable(),
-                       mlir::SideEffects::DefaultResource::get());
-  effects.emplace_back(mlir::MemoryEffects::Write::get(),
-                       &getDestMemrefMutable(),
-                       mlir::SideEffects::DefaultResource::get());
-}
-
 #define GET_OP_CLASSES
 #include "mlir/Dialect/OpenACC/OpenACCOps.cpp.inc"
 
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
new file mode 100644
index 0000000000000..2753750128699
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -0,0 +1,186 @@
+//===- OpenACCCG.cpp - OpenACC codegen ops, attributes, and types ---------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation for OpenACC codegen operations, attributes, and types.
+// These correspond to the definitions in OpenACCCG*.td tablegen files
+// and are kept in a separate file because they do not represent direct mappings
+// of OpenACC language constructs; they are intermediate representations used
+// when decomposing and lowering primary `acc` dialect operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Region.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace mlir;
+using namespace acc;
+
+namespace {
+
+/// Generic helper for single-region OpenACC ops that execute their body once
+/// and then return to the parent operation with their results (if any).
+static void
+getSingleRegionOpSuccessorRegions(Operation *op, Region &region,
+                                  RegionBranchPoint point,
+                                  SmallVectorImpl<RegionSuccessor> &regions) {
+  if (point.isParent()) {
+    regions.push_back(RegionSuccessor(&region));
+    return;
+  }
+  regions.push_back(RegionSuccessor::parent());
+}
+
+static ValueRange getSingleRegionSuccessorInputs(Operation *op,
+                                                 RegionSuccessor successor) {
+  return successor.isParent() ? ValueRange(op->getResults()) : ValueRange();
+}
+
+/// Remove empty acc.kernel_environment operations. If the operation has wait
+/// operands, create an acc.wait operation to preserve synchronization.
+struct RemoveEmptyKernelEnvironment
+    : public OpRewritePattern<acc::KernelEnvironmentOp> {
+  using OpRewritePattern<acc::KernelEnvironmentOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(acc::KernelEnvironmentOp op,
+                                PatternRewriter &rewriter) const override {
+    assert(op->getNumRegions() == 1 && "expected op to have one region");
+
+    Block &block = op.getRegion().front();
+    if (!block.empty())
+      return failure();
+
+    // Conservatively disable canonicalization of empty acc.kernel_environment
+    // operations if the wait operands in the kernel_environment cannot be fully
+    // represented by acc.wait operation.
+
+    // Disable canonicalization if device type is not the default
+    if (auto deviceTypeAttr = op.getWaitOperandsDeviceTypeAttr()) {
+      for (auto attr : deviceTypeAttr) {
+        if (auto dtAttr = mlir::dyn_cast<acc::DeviceTypeAttr>(attr)) {
+          if (dtAttr.getValue() != mlir::acc::DeviceType::None)
+            return failure();
+        }
+      }
+    }
+
+    // Disable canonicalization if any wait segment has a devnum
+    if (auto hasDevnumAttr = op.getHasWaitDevnumAttr()) {
+      for (auto attr : hasDevnumAttr) {
+        if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(attr)) {
+          if (boolAttr.getValue())
+            return failure();
+        }
+      }
+    }
+
+    // Disable canonicalization if there are multiple wait segments
+    if (auto segmentsAttr = op.getWaitOperandsSegmentsAttr()) {
+      if (segmentsAttr.size() > 1)
+        return failure();
+    }
+
+    // Remove empty kernel environment.
+    // Preserve synchronization by creating acc.wait operation if needed.
+    if (!op.getWaitOperands().empty() || op.getWaitOnlyAttr())
+      rewriter.replaceOpWithNewOp<acc::WaitOp>(op, op.getWaitOperands(),
+                                               /*asyncOperand=*/Value(),
+                                               /*waitDevnum=*/Value(),
+                                               /*async=*/nullptr,
+                                               /*ifCond=*/Value());
+    else
+      rewriter.eraseOp(op);
+
+    return success();
+  }
+};
+
+template <typename EffectTy>
+static void addOperandEffect(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects,
+    const MutableOperandRange &operand) {
+  for (unsigned i = 0, e = operand.size(); i < e; ++i)
+    effects.emplace_back(EffectTy::get(), &operand[i]);
+}
+
+template <typename EffectTy>
+static void addResultEffect(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects,
+    Value result) {
+  effects.emplace_back(EffectTy::get(), mlir::cast<mlir::OpResult>(result));
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// KernelEnvironmentOp
+//===----------------------------------------------------------------------===//
+
+void KernelEnvironmentOp::getSuccessorRegions(
+    RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
+  getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
+                                    regions);
+}
+
+ValueRange KernelEnvironmentOp::getSuccessorInputs(RegionSuccessor successor) {
+  return getSingleRegionSuccessorInputs(getOperation(), successor);
+}
+
+void KernelEnvironmentOp::getCanonicalizationPatterns(
+    RewritePatternSet &results, MLIRContext *context) {
+  results.add<RemoveEmptyKernelEnvironment>(context);
+}
+
+//===----------------------------------------------------------------------===//
+// FirstprivateMapInitialOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult FirstprivateMapInitialOp::verify() {
+  if (getDataClause() != acc::DataClause::acc_firstprivate)
+    return emitError("data clause associated with firstprivate operation must "
+                     "match its intent");
+  if (!getVar())
+    return emitError("must have var operand");
+  if (!mlir::isa<mlir::acc::PointerLikeType>(getVar().getType()) &&
+      !mlir::isa<mlir::acc::MappableType>(getVar().getType()))
+    return emitError("var must be mappable or pointer-like");
+  if (mlir::isa<mlir::acc::PointerLikeType>(getVar().getType()) &&
+      getVarType() == getVar().getType())
+    return emitError("varType must capture the element type of var");
+  if (getModifiers() != acc::DataClauseModifier::none)
+    return emitError("no data clause modifiers are allowed");
+  return success();
+}
+
+void FirstprivateMapInitialOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Read::get(),
+                       acc::CurrentDeviceIdResource::get());
+  addOperandEffect<MemoryEffects::Read>(effects, getVarMutable());
+  addResultEffect<MemoryEffects::Write>(effects, getAccVar());
+}
+
+//===----------------------------------------------------------------------===//
+// ReductionCombineOp
+//===----------------------------------------------------------------------===//
+
+void ReductionCombineOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Read::get(), &getSrcMemrefMutable(),
+                       SideEffects::DefaultResource::get());
+  effects.emplace_back(MemoryEffects::Read::get(), &getDestMemrefMutable(),
+                       SideEffects::DefaultResource::get());
+  effects.emplace_back(MemoryEffects::Write::get(), &getDestMemrefMutable(),
+                       SideEffects::DefaultResource::get());
+}
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
index a9ffa9dc208a0..fb9aa4018d263 100644
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -32,6 +32,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
   MLIRBufferizationTransforms
   MLIRDestinationStyleOpInterface
   MLIRDialectUtils
+  MLIRIndexDialect
   MLIRIR
   MLIRMemRefDialect
   MLIRPass
diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
index 4ea832177c4f9..0b132e9109492 100644
--- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
@@ -13,15 +13,31 @@
 #include "mlir/Dialect/SCF/Transforms/Passes.h"
 
 #include "mlir/Analysis/AliasAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+#include <optional>
+#include <tuple>
+
 namespace mlir {
 #define GEN_PASS_DEF_SCFPARALLELLOOPFUSION
 #include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
@@ -55,114 +71,670 @@ static bool equalIterationSpaces(ParallelOp firstPloop,
          matchOperands(firstPloop.getStep(), secondPloop.getStep());
 }
 
-/// Checks if the parallel loops have mixed access to the same buffers. Returns
-/// `true` if the first parallel loop writes to the same indices that the second
-/// loop reads.
-static bool haveNoReadsAfterWriteExceptSameIndex(
+/// Check if both operations are the same kind of memory write op and write to
+/// the same memory location (same buffer, indices and write attributes).
+/// Only memref.store, vector.transfer_write and vector.store can match.
+static bool opsWriteSameMemLocation(Operation *op1, Operation *op2) {
+  if (!op1 || !op2 || op1->getName() != op2->getName())
+    return false;
+  if (op1 == op2)
+    return true;
+  // op2 has the same op name as op1, so the casts below cannot fail.
+  return llvm::TypeSwitch<Operation *, bool>(op1)
+      .Case([&](memref::StoreOp storeOp1) {
+        auto storeOp2 = cast<memref::StoreOp>(op2);
+        return (storeOp1.getMemRef() == storeOp2.getMemRef()) &&
+               (storeOp1.getIndices() == storeOp2.getIndices());
+      })
+      .Case([&](vector::TransferWriteOp writeOp1) {
+        auto writeOp2 = cast<vector::TransferWriteOp>(op2);
+        // The permutation map selects which memref dims the vector spans, so
+        // writes that differ only in it can touch different elements.
+        return (writeOp1.getBase() == writeOp2.getBase()) &&
+               (writeOp1.getIndices() == writeOp2.getIndices()) &&
+               (writeOp1.getPermutationMap() ==
+                writeOp2.getPermutationMap()) &&
+               (writeOp1.getMask() == writeOp2.getMask()) &&
+               (writeOp1.getValueToStore().getType() ==
+                writeOp2.getValueToStore().getType()) &&
+               (writeOp1.getInBounds() == writeOp2.getInBounds());
+      })
+      .Case([&](vector::StoreOp vecStoreOp1) {
+        auto vecStoreOp2 = cast<vector::StoreOp>(op2);
+        return (vecStoreOp1.getBase() == vecStoreOp2.getBase()) &&
+               (vecStoreOp1.getIndices() == vecStoreOp2.getIndices()) &&
+               (vecStoreOp1.getValueToStore().getType() ==
+                vecStoreOp2.getValueToStore().getType()) &&
+               (vecStoreOp1.getAlignment() == vecStoreOp2.getAlignment()) &&
+               (vecStoreOp1.getNontemporal() == vecStoreOp2.getNontemporal());
+      })
+      // Distinct ops of any other kind are never considered the same write.
+      .Default([](Operation *) { return false; });
+}
+
+/// Return true when `val1` (taken from the first parallel loop) and `val2`
+/// (taken from the second one) can be considered the same value once the
+/// induction variables of the two loops are identified through `loopsIVsMap`.
+static bool valsAreEquivalent(Value val1, Value val2,
+                              const IRMapping &loopsIVsMap) {
+  // Two values match when either one maps onto the other. The same predicate
+  // is the operand comparator used when matching the defining ops below.
+  auto operandsMatch = [&](Value lhs, Value rhs) {
+    return success(loopsIVsMap.lookupOrDefault(lhs) == rhs ||
+                   loopsIVsMap.lookupOrDefault(rhs) == lhs);
+  };
+  if (val1 == val2 || succeeded(operandsMatch(val1, val2)))
+    return true;
+  // Otherwise both values must come from side-effect-free, equivalent ops.
+  Operation *def1 = val1.getDefiningOp();
+  Operation *def2 = val2.getDefiningOp();
+  if (!def1 || !def2 || !isMemoryEffectFree(def1) || !isMemoryEffectFree(def2))
+    return false;
+  return OperationEquivalence::isEquivalentTo(
+      def1, def2, operandsMatch, /*markEquivalent=*/nullptr,
+      OperationEquivalence::Flags::IgnoreLocations);
+}
+
+/// If `expr` is an integer addition of `base` and a constant, return that
+/// constant. The addition may be an arith.addi, an index.add or an
+/// affine.apply whose single-dim, symbol-free map is a sum of the dim and a
+/// constant. Equivalence with `base` is evaluated modulo `loopsIVsMap`.
+static std::optional<int64_t> getAddConstant(Value expr, Value base,
+                                             const IRMapping &loopsIVsMap) {
+  // Shared matcher for the two binary add ops: try `C + base` first, then
+  // `base + C`.
+  auto matchBinaryAdd = [&](auto addOp) -> std::optional<int64_t> {
+    std::optional<int64_t> lhsCst = getConstantIntValue(addOp.getLhs());
+    if (lhsCst && valsAreEquivalent(addOp.getRhs(), base, loopsIVsMap))
+      return lhsCst;
+    std::optional<int64_t> rhsCst = getConstantIntValue(addOp.getRhs());
+    if (rhsCst && valsAreEquivalent(addOp.getLhs(), base, loopsIVsMap))
+      return rhsCst;
+    return std::nullopt;
+  };
+
+  if (auto arithAdd = expr.getDefiningOp<arith::AddIOp>())
+    return matchBinaryAdd(arithAdd);
+  if (auto indexAdd = expr.getDefiningOp<index::AddOp>())
+    return matchBinaryAdd(indexAdd);
+
+  auto applyOp = expr.getDefiningOp<affine::AffineApplyOp>();
+  if (!applyOp)
+    return std::nullopt;
+  // Only a one-result, one-dim, symbol-free map applied to `base` qualifies.
+  AffineMap map = applyOp.getAffineMap();
+  if (map.getNumResults() != 1 || map.getNumDims() != 1 ||
+      map.getNumSymbols() != 0 ||
+      !valsAreEquivalent(applyOp.getOperand(0), base, loopsIVsMap))
+    return std::nullopt;
+  auto sum = dyn_cast<AffineBinaryOpExpr>(map.getResult(0));
+  if (!sum || sum.getKind() != AffineExprKind::Add)
+    return std::nullopt;
+  // The sum must pair a constant with the dim, in either order; the constant
+  // on the left-hand side is tried first.
+  AffineExpr lhs = sum.getLHS(), rhs = sum.getRHS();
+  if (auto cst = dyn_cast<AffineConstantExpr>(lhs);
+      cst && isa<AffineDimExpr>(rhs))
+    return cst.getValue();
+  if (auto cst = dyn_cast<AffineConstantExpr>(rhs);
+      cst && isa<AffineDimExpr>(lhs))
+    return cst.getValue();
+  return std::nullopt;
+}
+
+// Return true if the scalar load index is known to fall within the elements
+// written by a vector.store/transfer_write along one memref dimension. Cases:
+//
+// 1) Direct index match (subview offset equal to the constant 0):
+//    vector.transfer_write %v, %A[%i] : vector<4xf32>, memref<...>
+//    %x = memref.load %A[%i] : memref<...>
+//
+// 2) Constant loop IV range contained in the constant write range:
+//    vector.transfer_write %v, %A[%c0] : vector<4xf32>, memref<...>
+//    scf.for %k = %c0 to %c4 step %c1 { %x = memref.load %A[%k] }
+//
+// 3) Constant index (or write index + constant) within the write range:
+//    vector.transfer_write %v, %A[%c0] : vector<4xf32>, memref<...>
+//    %x = memref.load %A[%c2] : memref<...>
+//    %y = memref.load %A[%i + %c1] : memref<...>
+//
+// Args:
+// - loadIndex: index used by the scalar load for this dimension.
+// - offset: subview offset for the base memref dimension (zero if no subview).
+// - writeIndex: index used by the vector write for this dimension. Can be
+// null if the dim was dropped by a rank-reducing subview, whose result is
+// written by the vector write.
+// - extent: vector size along this dimension (number of elements written).
+// - loopsIVsMap: IV equivalence map between fused loops.
+static bool loadIndexWithinWriteRange(Value loadIndex, OpFoldResult offset,
+                                      Value writeIndex, int64_t extent,
+                                      const IRMapping &loopsIVsMap) {
+  if (extent <= 0)
+    return false;
+
+  // Extract constant (lb, ub, step) bounds for a loop IV (e.g. from scf.for).
+  auto getConstLoopBoundsForIV =
+      [](Value index) -> std::optional<std::tuple<int64_t, int64_t, int64_t>> {
+    auto blockArg = dyn_cast<BlockArgument>(index);
+    if (!blockArg)
+      return std::nullopt;
+    auto *parentOp = blockArg.getOwner()->getParentOp();
+    auto loopLike = dyn_cast<LoopLikeOpInterface>(parentOp);
+    if (!loopLike)
+      return std::nullopt;
+    auto ranges = getConstLoopBounds(loopLike);
+    if (ranges.empty())
+      return std::nullopt;
+
+    auto ivs = loopLike.getLoopInductionVars();
+    if (!ivs)
+      return std::nullopt;
+    auto it = llvm::find(*ivs, blockArg);
+    if (it == ivs->end())
+      return std::nullopt;
+    unsigned pos = std::distance(ivs->begin(), it);
+    if (pos >= ranges.size())
+      return std::nullopt;
+    auto [lb, ub, step] = ranges[pos];
+    return std::make_tuple(lb, ub, step);
+  };
+
+  std::optional<int64_t> offsetConst = getConstantIntValue(offset);
+  std::optional<int64_t> writeConst =
+      writeIndex ? getConstantIntValue(writeIndex) : std::optional<int64_t>(0);
+  if (!writeConst && writeIndex) {
+    // Treat single-iteration IVs as constants for matching.
+    if (auto bounds = getConstLoopBoundsForIV(writeIndex)) {
+      auto [lb, ub, step] = *bounds;
+      if (step > 0 && ub == lb + step)
+        writeConst = lb;
+    }
+  }
+
+  // Check whether the IV range [lb, ub) lies within the constant write range.
+  auto loopIVWithinRange = [](int64_t lb, int64_t ub, int64_t step,
+                              int64_t rangeStart, int64_t rangeExtent) -> bool {
+    if (rangeExtent <= 0 || step <= 0)
+      return false;
+    if (ub <= lb)
+      return false;
+    int64_t rangeEnd = rangeStart + rangeExtent;
+    return lb >= rangeStart && ub <= rangeEnd;
+  };
+
+  if (offsetConst && writeConst) {
+    // Constant start of the write range; check constant load or loop IV range.
+    int64_t start = *offsetConst + *writeConst;
+    if (auto loadConst = getConstantIntValue(loadIndex))
+      return (*loadConst >= start && *loadConst < start + extent);
+    if (auto bounds = getConstLoopBoundsForIV(loadIndex)) {
+      auto [lb, ub, step] = *bounds;
+      return loopIVWithinRange(lb, ub, step, start, extent);
+    }
+  }
+
+  if (writeIndex) {
+    // Direct IV match (or IV + constant) against the write index.
+    if (offsetConst && *offsetConst == 0 &&
+        valsAreEquivalent(loadIndex, writeIndex, loopsIVsMap))
+      return true;
+    if (auto addConst = getAddConstant(loadIndex, writeIndex, loopsIVsMap)) {
+      // Match load index of the form writeIndex + C within the write extent.
+      if (offsetConst) {
+        int64_t start = *offsetConst;
+        return (*addConst >= start && *addConst < start + extent);
+      }
+    }
+    return false;
+  }
+
+  if (auto offsetVal = dyn_cast<Value>(offset)) {
+    // Exact match when extent is 1 and the load hits the offset value.
+    if (extent == 1 && valsAreEquivalent(loadIndex, offsetVal, loopsIVsMap))
+      return true;
+  }
+
+  return false;
+}
+
+/// Return the base operand accessed by the given memory op, or a null Value
+/// when `op` is not one of the load/store ops handled below.
+static Value getBaseMemref(Operation *op) {
+  // TODO: use the common interface for memory ops once available.
+  return llvm::TypeSwitch<Operation *, Value>(op)
+      .Case<memref::LoadOp, memref::StoreOp>(
+          [](auto memOp) -> Value { return memOp.getMemRef(); })
+      .Case<vector::TransferReadOp, vector::TransferWriteOp, vector::LoadOp,
+            vector::StoreOp>(
+          [](auto vecOp) -> Value { return vecOp.getBase(); })
+      .Default([](Operation *) { return Value(); });
+}
+
+/// Recognize scalar memref.load of an element produced by a vector write
+/// (vector.transfer_write or vector.store, optionally through a rank-reducing
+/// unit-stride subview) of the same buffer. This covers the pattern where a
+/// vector write stores a full lane pack and a subsequent scalar load reads an
+/// element from that lane pack. EXAMPLE:
+///  vector.transfer_write %V, %arg[%x, %y, ..., 0] {in_bounds = [true]} :
+///             vector<4xf32>, memref<1x128x16x4xf32>
+///  scf.for %iter = %c0 to %c4 step %c1 iter_args(...) -> (f32) {
+///    %0 = memref.load %arg[%x, %y, ..., %iter] : memref<1x128x16x4xf32>
+///    ...
+///  }
+/// `vectorDimForWriteDim[d]` is the vector dim spanning write dim `d`, or -1.
+static bool isLoadOnWrittenVector(memref::LoadOp loadOp, Value writeBase,
+                                  ValueRange writeIndices, VectorType vecTy,
+                                  ArrayRef<int64_t> vectorDimForWriteDim,
+                                  const IRMapping &ivsMap) {
+  if (!vecTy)
+    return false;
+
+  Value base = writeBase;
+  // Set to the write base if there is no subview, else to the subview source.
+  MemrefValue baseMemref = nullptr;
+  SmallVector<OpFoldResult> offsets;
+  llvm::SmallBitVector droppedDims;
+  bool hasSubview = false;
+  auto *ctx = loadOp.getContext();
+  if (auto subView = base.getDefiningOp<memref::SubViewOp>()) {
+    if (!subView.hasUnitStride())
+      return false;
+    baseMemref = cast<MemrefValue>(subView.getSource());
+    offsets = llvm::to_vector(subView.getMixedOffsets());
+    droppedDims = subView.getDroppedDims();
+    hasSubview = true;
+  } else {
+    baseMemref = dyn_cast<MemrefValue>(base);
+    if (!baseMemref)
+      return false;
+  }
+
+  auto loadIndices = loadOp.getIndices();
+  unsigned baseRank = baseMemref.getType().getRank();
+  if ((loadOp.getMemref() != baseMemref) || (loadIndices.size() != baseRank))
+    return false;
+
+  unsigned writeRank = writeIndices.size();
+  if ((!hasSubview && writeRank != baseRank) ||
+      (hasSubview && offsets.size() != baseRank) ||
+      (vectorDimForWriteDim.size() != writeRank))
+    return false;
+
+  auto zeroAttr = IntegerAttr::get(IndexType::get(ctx), 0);
+  unsigned writeMemrefDim = 0; // next write index not yet consumed
+  for (unsigned baseDim : llvm::seq(baseRank)) {
+    bool wasDropped = (hasSubview && droppedDims.test(baseDim));
+    int64_t vectorDim = !wasDropped ? vectorDimForWriteDim[writeMemrefDim] : -1;
+    int64_t extent = 1; // one element unless the vector spans this dim
+    if (vectorDim >= 0) {
+      int64_t dimSize = vecTy.getDimSize(vectorDim);
+      if (dimSize == ShapedType::kDynamic)
+        return false;
+      extent = dimSize;
+    }
+    Value writeIndex = !wasDropped ? writeIndices[writeMemrefDim] : Value();
+    OpFoldResult offset =
+        hasSubview ? offsets[baseDim] : OpFoldResult(zeroAttr);
+    if (!loadIndexWithinWriteRange(loadIndices[baseDim], offset, writeIndex,
+                                   extent, ivsMap))
+      return false;
+    if (!wasDropped)
+      ++writeMemrefDim;
+  }
+
+  return true;
+}
+
+/// Return true if `loadOp` reads an element recognized as written by the
+/// vector.transfer_write `writeOp`, whose map must be a projected permutation.
+static bool loadMatchesVectorWrite(memref::LoadOp loadOp,
+                                   vector::TransferWriteOp writeOp,
+                                   const IRMapping &ivsMap) {
+  auto writtenVecTy = dyn_cast<VectorType>(writeOp.getVector().getType());
+  if (!writtenVecTy)
+    return false;
+
+  AffineMap permMap = writeOp.getPermutationMap();
+  unsigned numWriteIndices = writeOp.getIndices().size();
+  bool mapIsSupported = permMap.isProjectedPermutation() &&
+                        permMap.getNumResults() == writtenVecTy.getRank() &&
+                        permMap.getNumDims() == numWriteIndices;
+  if (!mapIsSupported)
+    return false;
+
+  // Invert the permutation map: for each memref (write) dim, record which
+  // vector dim spans it, or -1 when the vector does not span that dim.
+  SmallVector<int64_t> vectorDimForWriteDim(numWriteIndices, -1);
+  for (auto [vecDim, resultExpr] : llvm::enumerate(permMap.getResults())) {
+    auto dimExpr = dyn_cast<AffineDimExpr>(resultExpr);
+    if (!dimExpr || dimExpr.getPosition() >= numWriteIndices ||
+        vectorDimForWriteDim[dimExpr.getPosition()] != -1)
+      return false;
+    vectorDimForWriteDim[dimExpr.getPosition()] = vecDim;
+  }
+  return isLoadOnWrittenVector(loadOp, writeOp.getBase(), writeOp.getIndices(),
+                               writtenVecTy, vectorDimForWriteDim, ivsMap);
+}
+
+/// Return true if `loadOp` reads an element recognized as written by the
+/// vector.store `storeOp`.
+static bool loadMatchesVectorStore(memref::LoadOp loadOp,
+                                   vector::StoreOp storeOp,
+                                   const IRMapping &ivsMap) {
+  auto storedVecTy = dyn_cast<VectorType>(storeOp.getValueToStore().getType());
+  if (!storedVecTy)
+    return false;
+
+  unsigned numWriteIndices = storeOp.getIndices().size();
+  if (storedVecTy.getRank() > numWriteIndices)
+    return false;
+  // The vector dims are mapped, in order, onto the trailing `vecRank` write
+  // dims; the leading write dims are not spanned and keep -1.
+  unsigned vecRank = storedVecTy.getRank();
+  unsigned firstSpannedDim = numWriteIndices - vecRank;
+  SmallVector<int64_t> vectorDimForWriteDim(numWriteIndices, -1);
+  for (unsigned dim = firstSpannedDim; dim < numWriteIndices; ++dim)
+    vectorDimForWriteDim[dim] = dim - firstSpannedDim;
+  return isLoadOnWrittenVector(loadOp, storeOp.getBase(), storeOp.getIndices(),
+                               storedVecTy, vectorDimForWriteDim, ivsMap);
+}
+
+/// Check if both operations access the same positions of the same
+/// buffer, but one of the two does it through a rank-reducing full subview of
+/// the buffer (the other's base). EXAMPLE:
+///  memref.store %a, %buf[%c0, %i, %j] : memref<1x2x2xf32>
+///  %alias = memref.subview %buf[0, 0, 0][1, 2, 2][1, 1, 1]: memref<1x2x2xf32>
+///                           to memref<2x2xf32>
+///  %val = memref.load %alias[%i, %j] : memref<2x2xf32>
+template <typename OpTy1, typename OpTy2>
+static bool opsAccessSameIndicesViaRankReducingSubview(
+    OpTy1 op1, OpTy2 op2, const IRMapping &firstToSecondPloopIVsMap,
+    OpBuilder &b) {
+  // A base may be null or not a memref: bail out instead of asserting.
+  auto base1 = dyn_cast_if_present<MemrefValue>(getBaseMemref(op1));
+  auto base2 = dyn_cast_if_present<MemrefValue>(getBaseMemref(op2));
+  if (!base1 || !base2)
+    return false;
+  auto accessThroughTrivialSubviewIsSame =
+      [&b](memref::SubViewOp subView, ValueRange subViewAccess,
+           ValueRange sourceAccess, const IRMapping &ivsMap) -> bool {
+    SmallVector<Value> resolvedSubviewAccess;
+    LogicalResult resolved = resolveSourceIndicesRankReducingSubview(
+        subView.getLoc(), b, subView, subViewAccess, resolvedSubviewAccess);
+    if (failed(resolved) ||
+        (resolvedSubviewAccess.size() != sourceAccess.size()))
+      return false;
+    for (auto [dimIdx, resolvedIndex] :
+         llvm::enumerate(resolvedSubviewAccess)) {
+      if (!matchPattern(resolvedIndex, m_Zero()) &&
+          !valsAreEquivalent(resolvedIndex, sourceAccess[dimIdx], ivsMap))
+        return false;
+    }
+    return true;
+  };
+
+  // Case 1: op1 uses a subview of op2's base.
+  if (auto subView = base1.template getDefiningOp<memref::SubViewOp>();
+      subView &&
+      memref::isSameViewOrTrivialAlias(
+          base2, cast<MemrefValue>(subView.getSource())) &&
+      accessThroughTrivialSubviewIsSame(subView, op1.getIndices(),
+                                        op2.getIndices(),
+                                        firstToSecondPloopIVsMap))
+    return true;
+
+  // Case 2: op2 uses a subview of op1's base.
+  if (auto subView = base2.template getDefiningOp<memref::SubViewOp>();
+      subView &&
+      memref::isSameViewOrTrivialAlias(
+          base1, cast<MemrefValue>(subView.getSource())) &&
+      accessThroughTrivialSubviewIsSame(subView, op2.getIndices(),
+                                        op1.getIndices(),
+                                        firstToSecondPloopIVsMap))
+    return true;
+
+  return false;
+}
+
+/// Return true when the two memory read/write ops use equivalent indices,
+/// with induction variables of the first parallel loop matched to those of
+/// the second through `loopsIVsMap`. Accesses with a different number of
+/// indices are compared through a rank-reducing subview instead.
+template <typename OpTy1, typename OpTy2>
+static bool opsAccessSameIndices(OpTy1 op1, OpTy2 op2,
+                                 const IRMapping &loopsIVsMap, OpBuilder &b) {
+  auto lhsIndices = op1.getIndices();
+  auto rhsIndices = op2.getIndices();
+  if (lhsIndices.size() != rhsIndices.size())
+    return opsAccessSameIndicesViaRankReducingSubview(op1, op2, loopsIVsMap, b);
+  return llvm::all_of(llvm::zip(lhsIndices, rhsIndices), [&](auto idxPair) {
+    return valsAreEquivalent(std::get<0>(idxPair), std::get<1>(idxPair),
+                             loopsIVsMap);
+  });
+}
+
+/// Check if the loadOp reads exactly the memory location written by the
+/// storeOp: same indices and same access properties, including the vector type
+/// (and permutation map of transfer ops) so a read cannot span extra elements.
+static bool
+loadsFromSameMemoryLocationWrittenBy(Operation *loadOp, Operation *storeOp,
+                                     const IRMapping &firstToSecondPloopIVsMap,
+                                     OpBuilder &b) {
+  if (!loadOp || !storeOp)
+    return false;
+  return llvm::TypeSwitch<Operation *, bool>(loadOp)
+      .Case([&](memref::LoadOp memLoadOp) {
+        if (auto memStoreOp = dyn_cast<memref::StoreOp>(storeOp))
+          return opsAccessSameIndices(memLoadOp, memStoreOp,
+                                      firstToSecondPloopIVsMap, b);
+        if (auto vecWriteOp = dyn_cast<vector::TransferWriteOp>(storeOp))
+          return loadMatchesVectorWrite(memLoadOp, vecWriteOp,
+                                        firstToSecondPloopIVsMap);
+        if (auto vecStoreOp = dyn_cast<vector::StoreOp>(storeOp))
+          return loadMatchesVectorStore(memLoadOp, vecStoreOp,
+                                        firstToSecondPloopIVsMap);
+        return false;
+      })
+      .Case([&](vector::TransferReadOp vecReadOp) {
+        auto vecWriteOp = dyn_cast<vector::TransferWriteOp>(storeOp);
+        if (!vecWriteOp)
+          return false;
+        return opsAccessSameIndices(vecReadOp, vecWriteOp,
+                                    firstToSecondPloopIVsMap, b) &&
+               (vecReadOp.getVectorType() == vecWriteOp.getVectorType()) &&
+               (vecReadOp.getPermutationMap() ==
+                vecWriteOp.getPermutationMap()) &&
+               (vecReadOp.getMask() == vecWriteOp.getMask()) &&
+               (vecReadOp.getInBounds() == vecWriteOp.getInBounds());
+      })
+      .Case([&](vector::LoadOp vecLoadOp) {
+        auto vecStoreOp = dyn_cast<vector::StoreOp>(storeOp);
+        if (!vecStoreOp)
+          return false;
+        return opsAccessSameIndices(vecLoadOp, vecStoreOp,
+                                    firstToSecondPloopIVsMap, b) &&
+               (vecLoadOp.getVectorType() == vecStoreOp.getVectorType()) &&
+               (vecLoadOp.getAlignment() == vecStoreOp.getAlignment());
+      })
+      .Default([](Operation *) { return false; });
+}
+
+static Value getStoreOpTargetBuffer(Operation *op) {
+  return llvm::TypeSwitch<Operation *, Value>(op)
+      .Case([](memref::StoreOp memStore) { return memStore.getMemRef(); })
+      .Case<vector::TransferWriteOp, vector::StoreOp>(
+          [](auto vecWrite) -> Value { return vecWrite.getBase(); })
+      .Default([](Operation *) { return Value(); }); // unsupported op
+}
+
+/// Meant to be called when `mayAlias(val1, val2)` is true. Tell whether the
+/// potential aliasing between loadOp and storeOp can be resolved from their
+/// access patterns: only a memref.load against a vector write is handled.
+static bool canResolveAlias(Operation *loadOp, Operation *storeOp,
+                            const IRMapping &loopsIVsMap) {
+  if (auto write = dyn_cast<vector::TransferWriteOp>(storeOp)) {
+    auto load = dyn_cast<memref::LoadOp>(loadOp);
+    return load && loadMatchesVectorWrite(load, write, loopsIVsMap);
+  }
+  if (auto store = dyn_cast<vector::StoreOp>(storeOp)) {
+    auto load = dyn_cast<memref::LoadOp>(loadOp);
+    return load && loadMatchesVectorStore(load, store, loopsIVsMap);
+  }
+  return false;
+}
+
+/// Check that the parallel loops have no mixed access to the same buffers.
+/// Return `true` if the second parallel loop does not read the buffers written
+/// by the first loop using different indices (only reads are checked here).
+static bool haveNoDataDependenciesExceptSameIndex(
     ParallelOp firstPloop, ParallelOp secondPloop,
     const IRMapping &firstToSecondPloopIndices,
-    llvm::function_ref<bool(Value, Value)> mayAlias) {
-  DenseMap<Value, SmallVector<ValueRange, 1>> bufferStores;
-  SmallVector<Value> bufferStoresVec;
-  firstPloop.getBody()->walk([&](memref::StoreOp store) {
-    bufferStores[store.getMemRef()].push_back(store.getIndices());
-    bufferStoresVec.emplace_back(store.getMemRef());
-  });
-  auto walkResult = secondPloop.getBody()->walk([&](memref::LoadOp load) {
-    Value loadMem = load.getMemRef();
-    // Stop if the memref is defined in secondPloop body. Careful alias analysis
-    // is needed.
-    auto *memrefDef = loadMem.getDefiningOp();
-    if (memrefDef && memrefDef->getBlock() == load->getBlock())
+    llvm::function_ref<bool(Value, Value)> mayAlias, OpBuilder &b) {
+  // Map buffers to their store/write ops in the firstPloop
+  DenseMap<Value, SmallVector<Operation *>> bufferStoresInFirstPloop;
+  // Record all the memory buffers used in store/write ops found in firstPloop
+  llvm::SmallSetVector<Value, 4> buffersWrittenInFirstPloop;
+
+  auto collectStoreOpsInWalk = [&](Operation *op) {
+    auto memOpInterf = dyn_cast_if_present<MemoryEffectOpInterface>(op);
+    // Ignore ops that neither write to nor free memory
+    if (!memOpInterf || (!memOpInterf.hasEffect<MemoryEffects::Write>() &&
+                         !memOpInterf.hasEffect<MemoryEffects::Free>()))
+      return WalkResult::advance();
+
+    // Only these memory-writing ops are supported for now:
+    // memref.store, vector.transfer_write, vector.store
+    Value storeOpBase = getStoreOpTargetBuffer(op);
+    if (!storeOpBase)
       return WalkResult::interrupt();
 
-    for (Value store : bufferStoresVec)
-      if (store != loadMem && mayAlias(store, loadMem))
-        return WalkResult::interrupt();
+    // Expect the base operand to be a Memref
+    MemrefValue storeOpBaseMemref = dyn_cast<MemrefValue>(storeOpBase);
+    if (!storeOpBaseMemref)
+      return WalkResult::interrupt();
+    // Get the original memref buffer, skipping full view-like ops
+    Value buffer = memref::skipFullyAliasingOperations(storeOpBaseMemref);
+    bufferStoresInFirstPloop[buffer].push_back(op);
+    buffersWrittenInFirstPloop.insert(buffer);
+    return WalkResult::advance();
+  };
 
-    auto write = bufferStores.find(loadMem);
-    if (write == bufferStores.end())
-      return WalkResult::advance();
+  // Walk the first parallel loop to collect all store/write ops and their
+  // target buffers
+  if (firstPloop.getBody()->walk(collectStoreOpsInWalk).wasInterrupted())
+    return false;
 
-    // Check that at last one store was retrieved
-    if (write->second.empty())
+  // Check that this load/read op encountered while walking the second parallel
+  // loop does not have incompatible data dependencies with the store/write ops
+  // collected from the first parallel loop: the loops can be fused only if in
+  // the 2nd loop there are no loads from the buffers written in the 1st loop,
+  // except when on the same exact memory location (same indices) as written
+  // in the 1st loop. Unsupported memory-reading ops interrupt the walk.
+  auto checkLoadInWalkHasNoIncompatibleDataDeps = [&](Operation *loadOp) {
+    auto memOpInterf = dyn_cast_if_present<MemoryEffectOpInterface>(loadOp);
+    // To be conservative, we should stop on ops that don't advertise their
+    // memory effects. However, many ops don't implement MemoryEffectOpInterface
+    // yet, so for now we just skip them.
+    // TODO: once more ops add MemoryEffectOpInterface, interrupt the walk here.
+    if (!memOpInterf &&
+        !loadOp->hasTrait<mlir::OpTrait::HasRecursiveMemoryEffects>())
+      return WalkResult::advance();
+    // Ignore ops that don't read from memory, and wrapping ops that have nested
+    // memory effects (e.g. loops, conditionals) as they will be analyzed when
+    // visiting their nested ops.
+    if ((!memOpInterf &&
+         loadOp->hasTrait<mlir::OpTrait::HasRecursiveMemoryEffects>()) ||
+        (memOpInterf && !memOpInterf.hasEffect<MemoryEffects::Read>()))
+      return WalkResult::advance();
+    // Support only these memory-reading ops for now
+    if (!isa<memref::LoadOp, vector::TransferReadOp, vector::LoadOp>(loadOp) ||
+        !isa<MemrefValue>(loadOp->getOperand(0)))
       return WalkResult::interrupt();
 
-    auto storeIndices = write->second.front();
+    MemrefValue loadOpBase = cast<MemrefValue>(loadOp->getOperand(0));
+    MemrefValue loadedOrigBuf = memref::skipFullyAliasingOperations(loadOpBase);
 
-    // Multiple writes to the same memref are allowed only on the same indices
-    for (const auto &othStoreIndices : write->second) {
-      if (othStoreIndices != storeIndices)
+    for (Value storedMem : buffersWrittenInFirstPloop)
+      if ((storedMem != loadedOrigBuf) && mayAlias(storedMem, loadedOrigBuf) &&
+          !llvm::all_of(bufferStoresInFirstPloop[storedMem],
+                        [&](Operation *storeOp) {
+                          return canResolveAlias(loadOp, storeOp,
+                                                 firstToSecondPloopIndices);
+                        })) {
         return WalkResult::interrupt();
+      }
+
+    auto writeOpsIt = bufferStoresInFirstPloop.find(loadedOrigBuf);
+    if (writeOpsIt == bufferStoresInFirstPloop.end())
+      return WalkResult::advance();
+    // Store/write ops to this buffer in the firstPloop
+    SmallVector<mlir::Operation *> &writeOps = writeOpsIt->second;
+
+    // If the first loop has no writes to this buffer, continue
+    if (writeOps.empty())
+      return WalkResult::advance();
+
+    Operation *writeOp = writeOps.front();
+
+    // In the first parallel loop, multiple writes to the same memref are
+    // allowed only on the same memory location
+    if (!llvm::all_of(writeOps, [&](Operation *otherWriteOp) {
+          return opsWriteSameMemLocation(writeOp, otherWriteOp);
+        })) {
+      return WalkResult::interrupt();
     }
 
-    // Check that the load indices of secondPloop coincide with store indices of
-    // firstPloop for the same memrefs.
-    auto loadIndices = load.getIndices();
-    if (storeIndices.size() != loadIndices.size())
+    // Check that the load in secondPloop reads from the same memory location as
+    // written by the corresponding store in firstPloop
+    if (!loadsFromSameMemoryLocationWrittenBy(loadOp, writeOp,
+                                              firstToSecondPloopIndices, b)) {
       return WalkResult::interrupt();
-    for (int i = 0, e = storeIndices.size(); i < e; ++i) {
-      if (firstToSecondPloopIndices.lookupOrDefault(storeIndices[i]) !=
-          loadIndices[i]) {
-        auto *storeIndexDefOp = storeIndices[i].getDefiningOp();
-        auto *loadIndexDefOp = loadIndices[i].getDefiningOp();
-        if (storeIndexDefOp && loadIndexDefOp) {
-          if (!isMemoryEffectFree(storeIndexDefOp))
-            return WalkResult::interrupt();
-          if (!isMemoryEffectFree(loadIndexDefOp))
-            return WalkResult::interrupt();
-          if (!OperationEquivalence::isEquivalentTo(
-                  storeIndexDefOp, loadIndexDefOp,
-                  [&](Value storeIndex, Value loadIndex) {
-                    if (firstToSecondPloopIndices.lookupOrDefault(storeIndex) !=
-                        firstToSecondPloopIndices.lookupOrDefault(loadIndex))
-                      return failure();
-                    else
-                      return success();
-                  },
-                  /*markEquivalent=*/nullptr,
-                  OperationEquivalence::Flags::IgnoreLocations)) {
-            return WalkResult::interrupt();
-          }
-        } else {
-          return WalkResult::interrupt();
-        }
-      }
     }
+
     return WalkResult::advance();
-  });
-  return !walkResult.wasInterrupted();
+  };
+
+  // Walk the second parallel loop to check load/read ops against the stores
+  // collected from the first parallel loop.
+  return !secondPloop.getBody()
+              ->walk(checkLoadInWalkHasNoIncompatibleDataDeps)
+              .wasInterrupted();
 }
 
-/// Analyzes dependencies in the most primitive way by checking simple read and
-/// write patterns.
-static LogicalResult
-verifyDependencies(ParallelOp firstPloop, ParallelOp secondPloop,
-                   const IRMapping &firstToSecondPloopIndices,
-                   llvm::function_ref<bool(Value, Value)> mayAlias) {
-  if (!haveNoReadsAfterWriteExceptSameIndex(
-          firstPloop, secondPloop, firstToSecondPloopIndices, mayAlias))
-    return failure();
+/// Check, in both directions, that neither loop reads a buffer written by the
+/// other loop, unless the read targets the very same memory location (same
+/// indices). The second check runs with the loops and the IV mapping swapped.
+static bool
+noIncompatibleDataDependencies(ParallelOp firstPloop, ParallelOp secondPloop,
+                               const IRMapping &firstToSecondPloopIndices,
+                               llvm::function_ref<bool(Value, Value)> mayAlias,
+                               OpBuilder &b) {
+  if (!haveNoDataDependenciesExceptSameIndex(
+          firstPloop, secondPloop, firstToSecondPloopIndices, mayAlias, b))
+    return false;
 
   IRMapping secondToFirstPloopIndices;
   secondToFirstPloopIndices.map(secondPloop.getBody()->getArguments(),
                                 firstPloop.getBody()->getArguments());
-  return success(haveNoReadsAfterWriteExceptSameIndex(
-      secondPloop, firstPloop, secondToFirstPloopIndices, mayAlias));
+  return haveNoDataDependenciesExceptSameIndex(
+      secondPloop, firstPloop, secondToFirstPloopIndices, mayAlias, b);
 }
 
+/// Check if fusion of the two parallel loops is legal, i.e. neither loop has
+/// a nested parallel loop, both have equal iteration spaces, and there are no
+/// incompatible data dependencies between them (checked last, in this order).
 static bool isFusionLegal(ParallelOp firstPloop, ParallelOp secondPloop,
                           const IRMapping &firstToSecondPloopIndices,
-                          llvm::function_ref<bool(Value, Value)> mayAlias) {
+                          llvm::function_ref<bool(Value, Value)> mayAlias,
+                          OpBuilder &b) {
   return !hasNestedParallelOp(firstPloop) &&
          !hasNestedParallelOp(secondPloop) &&
          equalIterationSpaces(firstPloop, secondPloop) &&
-         succeeded(verifyDependencies(firstPloop, secondPloop,
-                                      firstToSecondPloopIndices, mayAlias));
+         noIncompatibleDataDependencies(firstPloop, secondPloop,
+                                        firstToSecondPloopIndices, mayAlias, b);
 }
 
-/// Prepends operations of firstPloop's body into secondPloop's body.
-/// Updates secondPloop with new loop.
+/// Prepend operations of firstPloop's body into secondPloop's body.
+/// Update secondPloop with new loop.
 static void fuseIfLegal(ParallelOp firstPloop, ParallelOp &secondPloop,
                         OpBuilder builder,
                         llvm::function_ref<bool(Value, Value)> mayAlias) {
@@ -172,7 +744,7 @@ static void fuseIfLegal(ParallelOp firstPloop, ParallelOp &secondPloop,
   firstToSecondPloopIndices.map(block1->getArguments(), block2->getArguments());
 
   if (!isFusionLegal(firstPloop, secondPloop, firstToSecondPloopIndices,
-                     mayAlias))
+                     mayAlias, builder))
     return;
 
   DominanceInfo dom;
@@ -272,6 +844,18 @@ struct ParallelLoopFusion
     auto &aa = getAnalysis<AliasAnalysis>();
 
     auto mayAlias = [&](Value val1, Value val2) -> bool {
+      // Values whose defining ops sit under different enclosing scf.parallel
+      // ops (or only one of them inside a loop) conservatively may alias.
+      // TODO: check if this is still needed as a separate check.
+      auto *val1Def = val1.getDefiningOp();
+      auto *val2Def = val2.getDefiningOp();
+      ParallelOp val1Loop =
+          val1Def ? val1Def->getParentOfType<ParallelOp>() : nullptr;
+      ParallelOp val2Loop =
+          val2Def ? val2Def->getParentOfType<ParallelOp>() : nullptr;
+      if (val1Loop != val2Loop)
+        return true;
+
       return !aa.alias(val1, val2).isNo();
     };
 
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index f8a4f057c9f0d..e795f3f0b019b 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -1560,6 +1560,25 @@ bool mlir::isPerfectlyNestedForLoops(
   return true;
 }
 
+llvm::SmallVector<std::tuple<int64_t, int64_t, int64_t>>
+mlir::getConstLoopBounds(mlir::LoopLikeOpInterface loopOp) {
+  std::optional<SmallVector<OpFoldResult>> loBnds = loopOp.getLoopLowerBounds();
+  std::optional<SmallVector<OpFoldResult>> upBnds = loopOp.getLoopUpperBounds();
+  std::optional<SmallVector<OpFoldResult>> steps = loopOp.getLoopSteps();
+  if (!loBnds || !upBnds || !steps)
+    return {};
+  llvm::SmallVector<std::tuple<int64_t, int64_t, int64_t>> loopRanges;
+  for (auto [lb, ub, step] : llvm::zip(*loBnds, *upBnds, *steps)) {
+    auto lbCst = getConstantIntValue(lb);
+    auto ubCst = getConstantIntValue(ub);
+    auto stepCst = getConstantIntValue(step);
+    if (!lbCst || !ubCst || !stepCst)
+      return {};
+    loopRanges.emplace_back(*lbCst, *ubCst, *stepCst);
+  }
+  return loopRanges;
+}
+
 llvm::SmallVector<llvm::APInt>
 mlir::getConstLoopTripCounts(mlir::LoopLikeOpInterface loopOp) {
   std::optional<SmallVector<OpFoldResult>> loBnds = loopOp.getLoopLowerBounds();
diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
index b8e001c9f6950..608abffd8bd82 100644
--- a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
+++ b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
@@ -257,6 +257,27 @@ func.func @round_vector(%x: vector<4xf32>) -> vector<4xf32> {
   return %0: vector<4xf32>
 }
 
+// Unit dimensional vectors are converted to scalars by inserting
+// unrealized_conversion_cast's.
+//
+// CHECK-LABEL: @round_vector_unit_dim
+//  CHECK-SAME: (%[[ARG:.+]]: vector<1xf32>) -> vector<1xf32>
+func.func @round_vector_unit_dim(%x: vector<1xf32>) -> vector<1xf32> {
+  // CHECK: %[[CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG]] : vector<1xf32> to f32
+  // CHECK: %[[ZERO:.+]] = spirv.Constant 0.000000e+00
+  // CHECK: %[[ONE:.+]] = spirv.Constant 1.000000e+00
+  // CHECK: %[[HALF:.+]] = spirv.Constant 5.000000e-01
+  // CHECK: %[[ABS:.+]] = spirv.GL.FAbs %[[CAST]] : f32
+  // CHECK: %[[FLOOR:.+]] = spirv.GL.Floor %[[ABS]]
+  // CHECK: %[[SUB:.+]] = spirv.FSub %[[ABS]], %[[FLOOR]]
+  // CHECK: %[[GE:.+]] = spirv.FOrdGreaterThanEqual %[[SUB]], %[[HALF]]
+  // CHECK: %[[SEL:.+]] = spirv.Select %[[GE]], %[[ONE]], %[[ZERO]]
+  // CHECK: %[[ADD:.+]] = spirv.FAdd %[[FLOOR]], %[[SEL]]
+  // CHECK: %[[BITCAST:.+]] = spirv.Bitcast %[[ADD]] : f32 to i32
+  %0 = math.round %x : vector<1xf32>
+  return %0: vector<1xf32>
+}
+
 } // end module
 
 // -----
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index 49b6342aea538..b736cde7689ed 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -2065,3 +2065,10 @@ llvm.func @invalid_sincos_gt_2_element_struct_return_type(%f: f32) -> () {
   // expected-error@+1 {{op expected result type to be an homogeneous struct with two elements matching the operand type}}
   llvm.intr.sincos(%f) : (f32) -> !llvm.struct<(f32, f32, f32)>
 }
+
+// -----
+
+module {
+  // expected-error@+1 {{'llvm.blockaddress' op expects an existing block label target in the referenced function}}
+  %0 = llvm.blockaddress <function = @missing_func, tag = <id = 1>> : !llvm.ptr
+}
diff --git a/mlir/test/Dialect/SCF/parallel-loop-fusion.mlir b/mlir/test/Dialect/SCF/parallel-loop-fusion.mlir
index 0d4ea6f20e8d9..d876062b704f2 100644
--- a/mlir/test/Dialect/SCF/parallel-loop-fusion.mlir
+++ b/mlir/test/Dialect/SCF/parallel-loop-fusion.mlir
@@ -314,23 +314,24 @@ func.func @do_not_fuse_unmatching_read_write_patterns(
 
 // -----
 
-func.func @do_not_fuse_loops_with_memref_defined_in_loop_bodies() {
+func.func @do_not_fuse_loops_with_nonfull_alias_defined_in_loop_bodies() {
   %c2 = arith.constant 2 : index
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
+  %c1fp = arith.constant 1.0 : f32
   %buffer  = memref.alloc() : memref<2x2xf32>
-  scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+  scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c1) step (%c1, %c1) {
+    memref.store %c1fp, %buffer[%i, %j] : memref<2x2xf32>
     scf.reduce
   }
-  scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
-    %A = memref.subview %buffer[%c0, %c0][%c2, %c2][%c1, %c1]
-      : memref<2x2xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>>
-    %A_elem = memref.load %A[%i, %j] : memref<?x?xf32, strided<[?, ?], offset: ?>>
+  scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c1) step (%c1, %c1) {
+    %A = memref.subview %buffer[%i, %c0][2, 1][1, 1] : memref<2x2xf32> to memref<2x1xf32, strided<[2, 1], offset: ?>>
+    %A_elem = memref.load %A[%i, %j] : memref<2x1xf32, strided<[2, 1], offset: ?>>
     scf.reduce
   }
   return
 }
-// CHECK-LABEL: func @do_not_fuse_loops_with_memref_defined_in_loop_bodies
+// CHECK-LABEL: func @do_not_fuse_loops_with_nonfull_alias_defined_in_loop_bodies
 // CHECK:        scf.parallel
 // CHECK:        scf.parallel
 
@@ -604,6 +605,415 @@ func.func @do_not_fuse_affine_apply_to_non_ind_var(
 
 // -----
 
+func.func @fuse_trivial_rank_reducing_subview() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c1fp = arith.constant 1.0 : f32
+  %buf = memref.alloc() : memref<1x2x2xf32>
+  scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+    memref.store %c1fp, %buf[%c0, %i, %j] : memref<1x2x2xf32>
+    scf.reduce
+  }
+  %sub = memref.subview %buf[0, 0, 0][1, 2, 2][1, 1, 1]
+      : memref<1x2x2xf32> to memref<2x2xf32>
+  scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+    %v = memref.load %sub[%i, %j] : memref<2x2xf32>
+    memref.store %v, %buf[%c0, %i, %j] : memref<1x2x2xf32>
+    scf.reduce
+  }
+  memref.dealloc %buf : memref<1x2x2xf32>
+  return
+}
+// CHECK-LABEL: func @fuse_trivial_rank_reducing_subview
+// CHECK:       %[[BUF:.*]] = memref.alloc() : memref<1x2x2xf32>
+// CHECK:       %[[SUB:.*]] = memref.subview %[[BUF]]
+// CHECK:       scf.parallel
+// CHECK:         memref.store {{.*}}, %[[BUF]]
+// CHECK:         %[[L:.*]] = memref.load %[[SUB]]
+// CHECK:         memref.store %[[L]], %[[BUF]]
+// CHECK-NOT:   scf.parallel
+// CHECK:       memref.dealloc %[[BUF]] : memref<1x2x2xf32>
+
+// -----
+
+func.func @do_not_fuse_nontrivial_subview_offset() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c1fp = arith.constant 1.0 : f32
+  %buf = memref.alloc() : memref<2x2x2xf32>
+  scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+    memref.store %c1fp, %buf[%c0, %i, %j] : memref<2x2x2xf32>
+    scf.reduce
+  }
+  %sub = memref.subview %buf[1, 0, 0][1, 2, 2][1, 1, 1]
+      : memref<2x2x2xf32> to memref<2x2xf32, strided<[2, 1], offset: 4>>
+  scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+    %v = memref.load %sub[%i, %j]
+        : memref<2x2xf32, strided<[2, 1], offset: 4>>
+    memref.store %v, %buf[%c0, %i, %j] : memref<2x2x2xf32>
+    scf.reduce
+  }
+  memref.dealloc %buf : memref<2x2x2xf32>
+  return
+}
+// CHECK-LABEL: func @do_not_fuse_nontrivial_subview_offset
+// CHECK:       scf.parallel
+// CHECK:       scf.parallel
+
+// -----
+
+func.func @fuse_vector_load_store(%A: memref<4x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %vec0 = arith.constant dense<0.0> : vector<4xf32>
+  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+    vector.store %vec0, %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+    %v = vector.load %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+    vector.store %v, %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @fuse_vector_load_store
+// CHECK:       scf.parallel (%[[I:.*]]) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
+// CHECK:         vector.store
+// CHECK:         %[[V:.*]] = vector.load
+// CHECK:         vector.store %[[V]]
+// CHECK-NOT:   scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_different_indices(%A: memref<4x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %vec0 = arith.constant dense<0.0> : vector<4xf32>
+  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+    vector.store %vec0, %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+    %j = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
+    %v = vector.load %A[%j, %c0] : memref<4x4xf32>, vector<4xf32>
+    vector.store %v, %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_different_indices
+// CHECK:       scf.parallel
+// CHECK:       scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_same_indices(%A: memref<4x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %zero = arith.constant 0.0 : f32
+  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+    %v = vector.transfer_read %A[%i, %c0], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<4x4xf32>, vector<4xf32>
+    vector.transfer_write %v, %A[%i, %c0] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<4x4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+    %v = vector.transfer_read %A[%i, %c0], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<4x4xf32>, vector<4xf32>
+    vector.transfer_write %v, %A[%i, %c0] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<4x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_same_indices
+// CHECK:       scf.parallel
+// CHECK:         vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK:         vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK:         vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK:         vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK-NOT:   scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_transfer_different_indices(%A: memref<4x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %zero = arith.constant 0.0 : f32
+  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+    %v = vector.transfer_read %A[%i, %c0], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<4x4xf32>, vector<4xf32>
+    vector.transfer_write %v, %A[%i, %c0] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<4x4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+    %j = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
+    %v = vector.transfer_read %A[%j, %c0], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<4x4xf32>, vector<4xf32>
+    vector.transfer_write %v, %A[%i, %c0] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<4x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_transfer_different_indices
+// CHECK:       scf.parallel
+// CHECK:       scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_with_subview(%A: memref<1x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %zero = arith.constant 0.0 : f32
+  %vec = arith.constant dense<1.0> : vector<4xf32>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %sub = memref.subview %A[0, 0][1, 4][1, 1] : memref<1x4xf32> to memref<4xf32>
+    vector.transfer_write %vec, %sub[%c0] {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} : vector<4xf32>, memref<4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+      %v = memref.load %A[%c0, %k] : memref<1x4xf32>
+      %n = arith.addf %v, %acc : f32
+      scf.yield %n : f32
+    }
+    memref.store %sum, %A[%c0, %c0] : memref<1x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_with_subview
+// CHECK:       scf.parallel
+// CHECK:         vector.transfer_write
+// CHECK:         scf.for
+// CHECK-NOT:   scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_transfer_nontrivial_subview(%A: memref<2x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %zero = arith.constant 0.0 : f32
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %v = vector.transfer_read %A[%c0, %i], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<2x4xf32>, vector<1xf32>
+    vector.transfer_write %v, %A[%c0, %i] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<1xf32>, memref<2x4xf32>
+    scf.reduce
+  }
+  %sub = memref.subview %A[1, 0][1, 4][1, 1] : memref<2x4xf32> to memref<4xf32, strided<[1], offset: 4>>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %v = vector.transfer_read %sub[%i], %zero {in_bounds = [true]} : memref<4xf32, strided<[1], offset: 4>>, vector<1xf32>
+    vector.transfer_write %v, %sub[%i] {in_bounds = [true]} : vector<1xf32>, memref<4xf32, strided<[1], offset: 4>>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_transfer_nontrivial_subview
+// CHECK:       scf.parallel
+// CHECK:       scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_transfer_different_masks(%A: memref<1x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %zero = arith.constant 0.0 : f32
+  %mask_true = vector.create_mask %c1 : vector<1xi1>
+  %mask_false = vector.create_mask %c0 : vector<1xi1>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %v = vector.transfer_read %A[%c0, %i], %zero, %mask_true {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<1x4xf32>, vector<1xf32>
+    vector.transfer_write %v, %A[%c0, %i], %mask_true {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<1xf32>, memref<1x4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %v = vector.transfer_read %A[%c0, %i], %zero, %mask_false {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<1x4xf32>, vector<1xf32>
+    vector.transfer_write %v, %A[%c0, %i], %mask_false {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<1xf32>, memref<1x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_transfer_different_masks
+// CHECK:       scf.parallel
+// CHECK:       scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_subview_rank_reducing(%A: memref<1x4xf32>, %B: memref<1x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %zero = arith.constant 0.0 : f32
+  %vec = arith.constant dense<1.0> : vector<4xf32>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %sub = memref.subview %A[%i, %c0][1, 4][1, 1] : memref<1x4xf32> to memref<4xf32, strided<[1], offset: ?>>
+    vector.transfer_write %vec, %sub[%c0] {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} : vector<4xf32>, memref<4xf32, strided<[1], offset: ?>>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+      %v = memref.load %A[%i, %k] : memref<1x4xf32>
+      %n = arith.addf %v, %acc : f32
+      scf.yield %n : f32
+    }
+    memref.store %sum, %B[%i, %c0] : memref<1x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_subview_rank_reducing
+// CHECK:       scf.parallel
+// CHECK:         vector.transfer_write
+// CHECK:         scf.for
+// CHECK-NOT:   scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_transfer_subview_offset(%A: memref<1x4xf32>, %B: memref<1x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %zero = arith.constant 0.0 : f32
+  %vec = arith.constant dense<1.0> : vector<4xf32>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %sub = memref.subview %A[%i, %c0][1, 4][1, 1] : memref<1x4xf32> to memref<4xf32, strided<[1], offset: ?>>
+    vector.transfer_write %vec, %sub[%c0] {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} : vector<4xf32>, memref<4xf32, strided<[1], offset: ?>>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+      %v = memref.load %A[%i, %k] : memref<1x4xf32>
+      %n = arith.addf %v, %acc : f32
+      scf.yield %n : f32
+    }
+    // Read from an offset alias to prevent fusion.
+    %off = memref.subview %A[%i, %c1][1, 3][1, 1] : memref<1x4xf32> to memref<3xf32, strided<[1], offset: ?>>
+    %v0 = memref.load %off[%c0] : memref<3xf32, strided<[1], offset: ?>>
+    %res = arith.addf %sum, %v0 : f32
+    memref.store %res, %B[%i, %c0] : memref<1x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_transfer_subview_offset
+// CHECK:       scf.parallel
+// CHECK:       scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_no_subview(%A: memref<1x4xf32>, %B: memref<1x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %zero = arith.constant 0.0 : f32
+  %vec = arith.constant dense<2.0> : vector<4xf32>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    vector.transfer_write %vec, %A[%c0, %i] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<1x4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+      %v = memref.load %A[%c0, %k] : memref<1x4xf32>
+      %n = arith.addf %v, %acc : f32
+      scf.yield %n : f32
+    }
+    memref.store %sum, %B[%c0, %c0] : memref<1x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_no_subview
+// CHECK:       vector.transfer_write
+// CHECK:       scf.for
+// CHECK-NOT:   scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_scalar_load_rank2(%A: memref<2x4xf32>, %B: memref<2x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %vec = arith.constant dense<1.0> : vector<2x4xf32>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    vector.transfer_write %vec, %A[%c0, %c0] {permutation_map = affine_map<(d0, d1) -> (d0, d1)>, in_bounds = [true, true]} : vector<2x4xf32>, memref<2x4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %v0 = memref.load %A[%c0, %c1] : memref<2x4xf32>
+    %v1 = memref.load %A[%c1, %c2] : memref<2x4xf32>
+    %sum = arith.addf %v0, %v1 : f32
+    memref.store %sum, %B[%c0, %c0] : memref<2x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_scalar_load_rank2
+// CHECK:       scf.parallel
+// CHECK:         vector.transfer_write
+// CHECK:         memref.load
+// CHECK:         memref.load
+// CHECK-NOT:   scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_scalar_load_loop_rank2(%A: memref<2x4xf32>, %B: memref<2x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %zero = arith.constant 0.0 : f32
+  %vec = arith.constant dense<2.0> : vector<2x4xf32>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    vector.transfer_write %vec, %A[%c0, %c0] {permutation_map = affine_map<(d0, d1) -> (d0, d1)>, in_bounds = [true, true]} : vector<2x4xf32>, memref<2x4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+      %v = memref.load %A[%c1, %k] : memref<2x4xf32>
+      %n = arith.addf %v, %acc : f32
+      scf.yield %n : f32
+    }
+    memref.store %sum, %B[%c0, %c0] : memref<2x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_scalar_load_loop_rank2
+// CHECK:       scf.parallel
+// CHECK:         vector.transfer_write
+// CHECK:         scf.for
+// CHECK-NOT:   scf.parallel
+
+// -----
+
+func.func @fuse_vector_store_scalar_load_rank2(%A: memref<2x4xf32>, %B: memref<2x4xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %vec = arith.constant dense<3.0> : vector<2x4xf32>
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    vector.store %vec, %A[%c0, %c0] : memref<2x4xf32>, vector<2x4xf32>
+    scf.reduce
+  }
+  scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+    %v0 = memref.load %A[%c1, %c2] : memref<2x4xf32>
+    %v1 = memref.load %A[%c0, %c3] : memref<2x4xf32>
+    %sum = arith.addf %v0, %v1 : f32
+    memref.store %sum, %B[%c0, %c0] : memref<2x4xf32>
+    scf.reduce
+  }
+  return
+}
+// CHECK-LABEL: func @fuse_vector_store_scalar_load_rank2
+// CHECK:       scf.parallel
+// CHECK:         vector.store
+// CHECK:         memref.load
+// CHECK:         memref.load
+// CHECK-NOT:   scf.parallel
+
+// -----
+
 func.func @fuse_reductions_two(%A: memref<2x2xf32>, %B: memref<2x2xf32>) -> (f32, f32) {
   %c2 = arith.constant 2 : index
   %c0 = arith.constant 0 : index
diff --git a/mlir/test/Integration/Dialect/XeGPU/LANE/no-xegpu-ops.mlir b/mlir/test/Integration/Dialect/XeGPU/LANE/no-xegpu-ops.mlir
new file mode 100644
index 0000000000000..548e1864b1a05
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeGPU/LANE/no-xegpu-ops.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt %s --gpu-async-region --gpu-lower-to-xevm-pipeline="xegpu-op-level=lane" \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
+  memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+    %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+    %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+    %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %mem = gpu.alloc () : memref<2x2x2xf32>
+    gpu.memcpy %mem, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    %memref_0 = gpu.alloc () : memref<2x2x2xf32>
+    gpu.memcpy %memref_0, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    %memref_2 = gpu.alloc () : memref<2x2x2xf32>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
+          args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
+    %alloc = memref.alloc() : memref<2x2x2xf32>
+    gpu.memcpy %alloc, %memref_2 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    gpu.dealloc %memref_2 : memref<2x2x2xf32>
+    gpu.dealloc %memref_0 : memref<2x2x2xf32>
+    gpu.dealloc %mem : memref<2x2x2xf32>
+    return %alloc : memref<2x2x2xf32>
+  }
+  gpu.module @test_kernel {
+    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = gpu.block_id  z
+      %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+      %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+      %5 = arith.addf %3, %4 : f32
+      memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+      gpu.return
+    }
+  }
+  // CHECK: [2.3, 4.5]
+  // CHECK: [7.8, 10.2]
+  // CHECK: [12.7, 14.9]
+  // CHECK: [18.2, 20.6]
+}
diff --git a/utils/bazel/BUILD.bazel b/utils/bazel/BUILD.bazel
index dd837093c62ac..4e907cc8b7ba7 100644
--- a/utils/bazel/BUILD.bazel
+++ b/utils/bazel/BUILD.bazel
@@ -2,4 +2,10 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-# Required to reference .bzl files in this package
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+
+bzl_library(
+    name = "configure",
+    srcs = ["configure.bzl"],
+    visibility = ["//visibility:public"],
+)
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
index 05fcbf7beb99f..edfa65654066d 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
@@ -2,7 +2,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("@rules_cc//cc:defs.bzl", "cc_library")
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -15,6 +15,7 @@ cc_library(
     name = "lib",
     srcs = glob(["*.cpp"]),
     hdrs = glob(["*.h"]),
+    includes = ["."],
     deps = [
         "//clang:ast",
         "//clang:ast_matchers",
@@ -26,3 +27,16 @@ cc_library(
         "//llvm:Support",
     ],
 )
+
+cc_binary(
+    name = "clang-query",
+    srcs = ["tool/ClangQuery.cpp"],
+    stamp = 0,
+    deps = [
+        ":lib",
+        "//clang:frontend",
+        "//clang:tooling",
+        "//llvm:LineEditor",
+        "//llvm:Support",
+    ],
+)
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index a5b6823b9ca3d..2ecd3502204e2 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2675,6 +2675,16 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_bf16divl",
+    hdrs = ["src/__support/math/bf16divl.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_fputil_bfloat16",
+        ":__support_macros_config",
+    ],
+)
+
 libc_support_library(
     name = "__support_math_bf16fmaf",
     hdrs = ["src/__support/math/bf16fmaf.h"],
@@ -2685,6 +2695,56 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_bf16fmal",
+    hdrs = ["src/__support/math/bf16fmal.h"],
+    deps = [
+        ":__support_fputil_bfloat16",
+        ":__support_fputil_fma",
+        ":__support_macros_config",
+    ],
+)
+
+libc_support_library(
+    name = "__support_math_bf16mul",
+    hdrs = ["src/__support/math/bf16mul.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_fputil_bfloat16",
+        ":__support_macros_config",
+    ],
+)
+
+libc_support_library(
+    name = "__support_math_bf16mulf",
+    hdrs = ["src/__support/math/bf16mulf.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_fputil_bfloat16",
+        ":__support_macros_config",
+    ],
+)
+
+libc_support_library(
+    name = "__support_math_bf16mulf128",
+    hdrs = ["src/__support/math/bf16mulf128.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_fputil_bfloat16",
+        ":__support_macros_config",
+        ":llvm_libc_types_float128",
+    ],
+)
+
+libc_support_library(
+    name = "__support_math_bf16mull",
+    hdrs = ["src/__support/math/bf16mull.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_fputil_bfloat16",
+    ],
+)
+
 libc_support_library(
     name = "__support_math_canonicalize",
     hdrs = ["src/__support/math/canonicalize.h"],
@@ -2992,6 +3052,47 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_f16add",
+    hdrs = ["src/__support/math/f16add.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_macros_config",
+        ":llvm_libc_macros_float16_macros",
+    ],
+)
+
+libc_support_library(
+    name = "__support_math_f16addf",
+    hdrs = ["src/__support/math/f16addf.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_macros_config",
+        ":llvm_libc_macros_float16_macros",
+    ],
+)
+
+libc_support_library(
+    name = "__support_math_f16addf128",
+    hdrs = ["src/__support/math/f16addf128.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_macros_config",
+        ":llvm_libc_macros_float16_macros",
+        ":llvm_libc_types_float128",
+    ],
+)
+
+libc_support_library(
+    name = "__support_math_f16addl",
+    hdrs = ["src/__support/math/f16addl.h"],
+    deps = [
+        ":__support_fputil_basic_operations",
+        ":__support_macros_config",
+        ":llvm_libc_macros_float16_macros",
+    ],
+)
+
 libc_support_library(
     name = "__support_math_f16fma",
     hdrs = ["src/__support/math/f16fma.h"],
@@ -3264,6 +3365,16 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_logbl",
+    hdrs = ["src/__support/math/logbl.h"],
+    deps = [
+        ":__support_common",
+        ":__support_fputil_manipulation_functions",
+        ":__support_macros_config",
+    ],
+)
+
 libc_support_library(
     name = "__support_math_log10",
     hdrs = ["src/__support/math/log10.h"],
@@ -4087,6 +4198,24 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_tanf16",
+    hdrs = ["src/__support/math/tanf16.h"],
+    deps = [
+        ":__support_fputil_cast",
+        ":__support_fputil_except_value_utils",
+        ":__support_fputil_fenv_impl",
+        ":__support_fputil_fp_bits",
+        ":__support_fputil_multiply_add",
+        ":__support_macros_optimization",
+        ":__support_macros_properties_types",
+        ":__support_math_sincosf16_utils",
+        ":hdr_errno_macros",
+        ":hdr_fenv_macros",
+        ":llvm_libc_macros_float16_macros",
+    ],
+)
+
 libc_support_library(
     name = "__support_math_tanhf",
     hdrs = ["src/__support/math/tanhf.h"],
@@ -4119,6 +4248,22 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_tanpif",
+    hdrs = ["src/__support/math/tanpif.h"],
+    deps = [
+        ":__support_common",
+        ":__support_fputil_cast",
+        ":__support_fputil_except_value_utils",
+        ":__support_fputil_fenv_impl",
+        ":__support_fputil_fp_bits",
+        ":__support_fputil_multiply_add",
+        ":__support_macros_config",
+        ":__support_macros_optimization",
+        ":__support_sincosf_utils",
+    ],
+)
+
 ############################### complex targets ################################
 
 libc_function(
@@ -4483,11 +4628,49 @@ libc_math_function(
     additional_deps = [":__support_math_bf16addf128"],
 )
 
+libc_math_function(
+    name = "bf16divl",
+    additional_deps = [":__support_math_bf16divl"],
+)
+
 libc_math_function(
     name = "bf16fmaf",
     additional_deps = [":__support_math_bf16fmaf"],
 )
 
+libc_math_function(
+    name = "bf16fmal",
+    additional_deps = [":__support_math_bf16fmal"],
+)
+
+libc_math_function(
+    name = "bf16fmul",
+    additional_deps = [
+        ":__support_math_bf16mul"
+    ],
+)
+
+libc_math_function(
+    name = "bf16fmulf",
+    additional_deps = [
+        ":__support_math_bf16mulf"
+    ],
+)
+
+libc_math_function(
+    name = "bf16fmulf128",
+    additional_deps = [
+        ":__support_math_bf16mulf128"
+    ],
+)
+
+libc_math_function(
+    name = "bf16fmull",
+    additional_deps = [
+        ":__support_math_bf16mull"
+    ],
+)
+
 libc_math_function(
     name = "canonicalize",
     additional_deps = [
@@ -4757,13 +4940,25 @@ libc_math_function(
     additional_deps = [":__support_math_expm1f16"],
 )
 
-libc_math_function(name = "f16add")
+libc_math_function(
+    name = "f16add",
+    additional_deps = [":__support_math_f16add"],
+)
 
-libc_math_function(name = "f16addf")
+libc_math_function(
+    name = "f16addf",
+    additional_deps = [":__support_math_f16addf"],
+)
 
-libc_math_function(name = "f16addf128")
+libc_math_function(
+    name = "f16addf128",
+    additional_deps = [":__support_math_f16addf128"],
+)
 
-libc_math_function(name = "f16addl")
+libc_math_function(
+    name = "f16addl",
+    additional_deps = [":__support_math_f16addl"],
+)
 
 libc_math_function(name = "f16div")
 
@@ -5312,7 +5507,10 @@ libc_math_function(
     additional_deps = [":__support_math_logbf"],
 )
 
-libc_math_function(name = "logbl")
+libc_math_function(
+    name = "logbl",
+    additional_deps = [":__support_math_logbl"],
+)
 
 libc_math_function(
     name = "logbf128",
@@ -5651,9 +5849,7 @@ libc_math_function(
 libc_math_function(
     name = "tanf16",
     additional_deps = [
-        ":__support_fputil_nearest_integer",
-        ":__support_fputil_polyeval",
-        ":__support_math_sincosf16_utils",
+        ":__support_math_tanf16",
     ],
 )
 
@@ -5674,11 +5870,7 @@ libc_math_function(
 libc_math_function(
     name = "tanpif",
     additional_deps = [
-        ":__support_sincosf_utils",
-        ":hdr_fenv_macros",
-        ":__support_macros_config",
-        ":__support_macros_optimization",
-        ":__support_fputil_multiply_add",
+        ":__support_math_tanpif",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
index 59916fe16be1d..24d2de92636d1 100644
--- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -2078,6 +2078,7 @@ cc_library(
         "//llvm:BinaryFormat",
         "//llvm:Object",
         "//llvm:Support",
+        "//llvm:TargetParser",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index c18f71f466ea2..3292454f49de4 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2911,8 +2911,10 @@ cc_library(
         ":DialectUtils",
         ":FuncDialect",
         ":IR",
+        ":IndexDialect",
         ":LoopLikeInterface",
         ":MemRefDialect",
+        ":MemRefUtils",
         ":Pass",
         ":Rewrite",
         ":SCFDialect",
@@ -2924,6 +2926,7 @@ cc_library(
         ":TensorTransforms",
         ":TilingInterface",
         ":TransformUtils",
+        ":VectorDialect",
         ":ViewLikeInterface",
         "//llvm:Support",
     ],

>From cc671df1081c4e50f99eaf2c4d8cd98b7b2e3413 Mon Sep 17 00:00:00 2001
From: hulxv <hulxxv at gmail.com>
Date: Tue, 24 Feb 2026 04:14:57 +0200
Subject: [PATCH 2/2] format

---
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index e50c7adf4628a..1695f80daa6dd 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -4790,28 +4790,28 @@ libc_math_function(
 libc_math_function(
     name = "bf16fmul",
     additional_deps = [
-        ":__support_math_bf16mul"
+        ":__support_math_bf16mul",
     ],
 )
 
 libc_math_function(
     name = "bf16fmulf",
     additional_deps = [
-        ":__support_math_bf16mulf"
+        ":__support_math_bf16mulf",
     ],
 )
 
 libc_math_function(
     name = "bf16fmulf128",
     additional_deps = [
-        ":__support_math_bf16mulf128"
+        ":__support_math_bf16mulf128",
     ],
 )
 
 libc_math_function(
     name = "bf16fmull",
     additional_deps = [
-        ":__support_math_bf16mull"
+        ":__support_math_bf16mull",
     ],
 )
 



More information about the flang-commits mailing list