[flang-commits] [clang] [clang-tools-extra] [compiler-rt] [flang] [libc] [libclc] [lldb] [llvm] [libc][math] Refactor bf16mul family to header-only (PR #182018)
Mohamed Emad via flang-commits
flang-commits at lists.llvm.org
Mon Feb 23 18:15:18 PST 2026
https://github.com/hulxv updated https://github.com/llvm/llvm-project/pull/182018
>From 36635f8c6d4dc09398c5cdef70a30c53b3016b90 Mon Sep 17 00:00:00 2001
From: hulxv <hulxxv at gmail.com>
Date: Thu, 19 Feb 2026 02:28:59 +0200
Subject: [PATCH 1/2] [libc][math] Refactor bf16mul family to header-only
Refactored functions:
- bf16mul
- bf16mulf
- bf16mulf128
- bf16mull
---
clang-tools-extra/clangd/TidyProvider.cpp | 3 +-
.../abseil/unchecked-statusor-access.rst | 42 +-
clang/docs/ReleaseNotes.rst | 2 +
clang/docs/analyzer/checkers.rst | 98 +-
.../FlowSensitive/DataflowAnalysisContext.h | 21 +-
.../SerializationFormatRegistry.h | 21 +-
clang/include/clang/Basic/BuiltinsAMDGPU.td | 148 +-
clang/include/clang/Basic/CodeGenOptions.def | 3 +
clang/include/clang/CIR/Dialect/IR/CIROps.td | 8 -
clang/include/clang/Options/Options.td | 17 +-
.../clang/StaticAnalyzer/Checkers/Checkers.td | 10 +-
clang/lib/AST/Stmt.cpp | 47 +-
.../FlowSensitive/DataflowAnalysisContext.cpp | 12 +-
.../lib/Analysis/FlowSensitive/RecordOps.cpp | 4 +-
clang/lib/Basic/ParsedAttrInfo.cpp | 17 +-
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 62 +-
clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 19 +-
clang/lib/CodeGen/CGExpr.cpp | 11 +-
clang/lib/CodeGen/CGHLSLRuntime.cpp | 26 +
clang/lib/CodeGen/CGHLSLRuntime.h | 3 +
clang/lib/CodeGen/CodeGenAction.cpp | 4 +-
clang/lib/CodeGen/CodeGenModule.cpp | 4 +-
clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp | 27 +-
clang/lib/CodeGen/HLSLBufferLayoutBuilder.h | 7 +
clang/lib/Driver/Driver.cpp | 7 +
clang/lib/Driver/ToolChains/CommonArgs.cpp | 11 +-
clang/lib/Driver/ToolChains/Darwin.cpp | 15 +-
clang/lib/Sema/SemaHLSL.cpp | 4 +-
clang/lib/Sema/SemaInit.cpp | 37 +-
clang/lib/Sema/SemaType.cpp | 76 +-
clang/lib/Serialization/ASTReaderDecl.cpp | 14 +
.../Checkers/BasicObjCFoundationChecks.cpp | 16 +-
.../Checkers/WebKit/NoDeleteChecker.cpp | 6 +-
.../Checkers/WebKit/PtrTypesSemantics.cpp | 82 +-
.../Checkers/WebKit/PtrTypesSemantics.h | 5 +
.../Analysis/Checkers/WebKit/mock-types.h | 42 +-
.../Checkers/WebKit/nodelete-annotation.cpp | 123 +
.../Checkers/WebKit/uncounted-local-vars.cpp | 6 +
.../test/Analysis/analyzer-enabled-checkers.c | 1 -
clang/test/Analysis/builtin_bitcast.cpp | 2 +-
clang/test/Analysis/concrete-address.c | 2 +-
clang/test/Analysis/dtor.cpp | 2 +-
clang/test/Analysis/fixed-address-notes.c | 2 +-
clang/test/Analysis/misc-ps.m | 4 +-
clang/test/Analysis/pr22954.c | 2 +-
...c-library-functions-arg-enabled-checkers.c | 1 -
...ress-dereferences-from-any-address-space.c | 8 +-
.../builtin-structured-binding-size.cpp | 30 +
.../builtin-trivally-copyable.cpp | 57 +
.../builtin-types-compatible.c | 59 +
clang/test/CodeGen/AArch64/neon/fullfp16.c | 22 +
.../CodeGen/AArch64/v8.2a-fp16-intrinsics.c | 16 -
clang/test/CodeGen/attr-no-outline.c | 120 +-
.../MatrixToAndFromVectorConstructors.hlsl | 121 +
clang/test/CodeGenHLSL/matrix_types.hlsl | 36 +-
.../resources/cbuffer_matrix_align.hlsl | 71 +
.../default_cbuffer_with_layout.hlsl | 11 +-
clang/test/CodeGenObjC/attr-no-outline.m | 73 +-
.../builtins-amdgcn-gfx12-wmma-w32.cl | 141 +-
.../builtins-amdgcn-gfx12-wmma-w64.cl | 141 +-
.../builtins-amdgcn-wmma-w32-gfx10-err.cl | 16 +-
.../builtins-amdgcn-wmma-w64-gfx10-err.cl | 16 +-
.../usr/include/libxml/.keep | 0
clang/test/Driver/aarch64-outliner.c | 2 +-
clang/test/Driver/arm-machine-outliner.c | 2 +-
clang/test/Driver/crash-diagnostics-dir-3.c | 2 +-
clang/test/Driver/crash-diagnostics-dir.c | 2 +-
clang/test/Driver/crash-ir-repro.cpp | 2 +-
clang/test/Driver/crash-report-clang-cl.cpp | 2 +-
clang/test/Driver/crash-report-header.h | 2 +-
clang/test/Driver/crash-report-spaces.c | 2 +-
clang/test/Driver/crash-report-with-asserts.c | 4 +-
clang/test/Driver/crash-report.cpp | 4 +-
clang/test/Driver/emit-reproducer.c | 14 +-
clang/test/Driver/incompatible_sysroot.c | 11 +-
clang/test/Driver/lit.local.cfg | 7 +
clang/test/Driver/output-file-cleanup.c | 2 +-
clang/test/Driver/riscv-outliner.c | 2 +-
clang/test/Driver/x86-outliner.c | 2 +-
clang/test/Modules/demote-var-def.cpp | 94 -
clang/test/Modules/pr149404-02.cppm | 104 -
clang/test/Modules/pr172241.cppm | 47 -
clang/test/Modules/var-inst-def.cppm | 110 -
.../BuiltinMatrix/MatrixSplatErrors.hlsl | 9 +-
clang/tools/driver/driver.cpp | 20 +-
.../sanitizer_common/sanitizer_unwind_win.cpp | 49 +-
.../flang/Optimizer/Builder/HLFIRTools.h | 25 +-
flang/lib/Lower/Bridge.cpp | 1 -
flang/lib/Optimizer/Builder/HLFIRTools.cpp | 35 +-
.../Support/FIROpenACCTypeInterfaces.cpp | 56 +-
flang/test/Lower/OpenACC/acc-reduction.f90 | 145 +-
flang/test/Lower/do-while-to-scf-while.f90 | 19 +-
libc/shared/math.h | 13 +
libc/shared/math/bf16divl.h | 23 +
libc/shared/math/bf16fmal.h | 25 +
libc/shared/math/bf16mul.h | 22 +
libc/shared/math/bf16mulf.h | 22 +
libc/shared/math/bf16mulf128.h | 28 +
libc/shared/math/bf16mull.h | 22 +
libc/shared/math/f16add.h | 29 +
libc/shared/math/f16addf.h | 29 +
libc/shared/math/f16addf128.h | 32 +
libc/shared/math/f16addl.h | 29 +
libc/shared/math/logbl.h | 23 +
libc/shared/math/tanf16.h | 29 +
libc/shared/math/tanpif.h | 23 +
libc/src/__support/math/CMakeLists.txt | 141 +
libc/src/__support/math/bf16divl.h | 26 +
libc/src/__support/math/bf16fmal.h | 26 +
libc/src/__support/math/bf16mul.h | 27 +
libc/src/__support/math/bf16mulf.h | 27 +
libc/src/__support/math/bf16mulf128.h | 33 +
libc/src/__support/math/bf16mull.h | 27 +
libc/src/__support/math/f16add.h | 31 +
libc/src/__support/math/f16addf.h | 31 +
libc/src/__support/math/f16addf128.h | 34 +
libc/src/__support/math/f16addl.h | 31 +
libc/src/__support/math/logbl.h | 26 +
libc/src/__support/math/tanf16.h | 137 +
libc/src/__support/math/tanpif.h | 115 +
libc/src/math/generic/CMakeLists.txt | 80 +-
libc/src/math/generic/bf16divl.cpp | 7 +-
libc/src/math/generic/bf16fmal.cpp | 9 +-
libc/src/math/generic/bf16mul.cpp | 7 +-
libc/src/math/generic/bf16mulf.cpp | 7 +-
libc/src/math/generic/bf16mulf128.cpp | 7 +-
libc/src/math/generic/bf16mull.cpp | 7 +-
libc/src/math/generic/f16add.cpp | 6 +-
libc/src/math/generic/f16addf.cpp | 6 +-
libc/src/math/generic/f16addf128.cpp | 6 +-
libc/src/math/generic/f16addl.cpp | 6 +-
libc/src/math/generic/logbl.cpp | 6 +-
libc/src/math/generic/tanf16.cpp | 112 +-
libc/src/math/generic/tanpif.cpp | 95 +-
libc/test/shared/CMakeLists.txt | 13 +
libc/test/shared/shared_math_test.cpp | 21 +
.../lib/amdgcn/workitem/get_local_size.cl | 10 +-
.../lib/amdgcn/workitem/get_num_groups.cl | 10 +-
.../src/detail/offload/offload_topology.cpp | 2 +-
lldb/include/lldb/Host/ProcessLaunchInfo.h | 3 +-
.../include/lldb/Host/posix/HostThreadPosix.h | 2 +-
lldb/include/lldb/Utility/ArchSpec.h | 11 +
.../Python/lldbsuite/test/lldbinline.py | 1 +
.../Python/lldbsuite/test/lldbtest.py | 16 +-
.../test/tools/lldb-dap/dap_server.py | 25 +-
.../test/tools/lldb-dap/lldbdap_testcase.py | 37 +-
lldb/source/Host/posix/HostThreadPosix.cpp | 12 +-
.../Disassembler/LLVMC/DisassemblerLLVMC.cpp | 39 +-
.../Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 187 +
.../Plugins/ObjectFile/ELF/ObjectFileELF.h | 4 +
.../Platform/MacOSX/PlatformDarwin.cpp | 232 +-
.../Process/Windows/Common/ProcessWindows.cpp | 4 -
.../API/commands/frame/var/TestFrameVar.py | 1 +
.../platform/connect/TestPlatformConnect.py | 1 +
.../TestPlatformLaunchGDBServer.py | 1 +
.../process/launch/TestProcessLaunch.py | 1 +
.../use_source_cache/TestUseSourceCache.py | 1 +
.../commands/statistics/basic/TestStats.py | 1 +
.../TestAutoInstallMainExecutable.py | 1 +
.../dwo/TestDumpDwo.py | 1 +
.../API/commands/trace/TestTraceStartStop.py | 2 +
.../TestBreakpointCommand.py | 1 +
.../comp_dir_symlink/TestCompDirSymLink.py | 2 +
.../breakpoint/objc/TestObjCBreakpoints.py | 2 +
.../generic/map/TestDataFormatterStdMap.py | 1 +
.../string/TestDataFormatterStdString.py | 1 +
.../gdb_remote_client/TestPty.py | 2 +
.../inferior-changed/TestInferiorChanged.py | 2 +
.../limit-debug-info/TestLimitDebugInfo.py | 2 +
.../module_cache/bsd/TestModuleCacheBSD.py | 2 +
.../debug_index/TestDebugIndexCache.py | 2 +
.../rerun_and_expr/TestRerunAndExpr.py | 2 +
.../TestRerunAndExprDylib.py | 2 +
.../thread/step_until/TestStepUntilAPI.py | 1 +
.../TestCCallingConventions.py | 1 +
.../TestSharedLibStrippedSymbols.py | 2 +
.../cpp/abi_tag_lookup/TestAbiTagLookup.py | 2 +
.../abi_tag_structors/TestAbiTagStructors.py | 2 +
.../TestConstStaticIntegralMember.py | 2 +
.../TestExprDefinitionInDylib.py | 1 +
.../TestTemplateWithSameArg.py | 2 +
.../lang/cpp/namespace/TestNamespaceLookup.py | 2 +
.../cpp/template-alias/TestTemplateAlias.py | 2 +
.../TestTemplateFunctions.py | 2 +
.../cpp/unique-types3/TestUniqueTypes3.py | 2 +
.../TestObjCStructArgument.py | 2 +
.../TestLLDBUtilFailedToHitBreakpoint.py | 1 -
.../TestFirmwareCorefiles.py | 2 +
.../macosx/simulator/TestSimulatorPlatform.py | 1 +
.../skinny-corefile/TestSkinnyCorefile.py | 2 +
.../python_api/debugger/TestDebuggerAPI.py | 1 +
.../TestTargetArchFromModule.py | 2 +
.../TestModuleUnifiedSectionList.py | 2 +
.../riscv/disassembler/TestDisassembler.py | 79 +
lldb/test/API/riscv/disassembler/a.out.yaml | 32 +
.../riscv/disassembler/conflicting.out.yaml | 38 +
.../API/riscv/disassembler/stripped.out.yaml | 28 +
.../API/source-manager/TestSourceManager.py | 1 +
lldb/test/API/test_utils/base/TestBaseTest.py | 2 +
.../breakpoint/TestDAP_setBreakpoints.py | 2 +
.../lldb-dap/disconnect/TestDAP_disconnect.py | 2 +
.../lldb-dap/launch/TestDAP_launch_args.py | 3 -
.../lldb-dap/launch/TestDAP_launch_basic.py | 3 -
.../launch/TestDAP_launch_debuggerRoot.py | 3 -
.../TestDAP_launch_environment_with_object.py | 3 -
...AP_launch_shellExpandArguments_disabled.py | 3 -
...DAP_launch_shellExpandArguments_enabled.py | 5 +-
...AP_launch_stdio_redirection_and_console.py | 6 +-
.../lldb-dap/launch/TestDAP_launch_version.py | 3 -
.../runInTerminal/TestDAP_runInTerminal.py | 2 +
.../lldb-dap/variables/TestDAP_variables.py | 12 +
.../lldb-server/TestGdbRemotePlatformFile.py | 2 +
.../commandline/TestGdbRemoteConnection.py | 2 +
lldb/test/API/types/AbstractBase.py | 2 +
.../Handler/SetVariableRequestHandler.cpp | 3 +-
.../lldb-dap/Protocol/ProtocolRequests.h | 2 +-
lldb/unittests/DAP/ProtocolRequestsTest.cpp | 20 +
llvm/cmake/modules/AddLLVM.cmake | 7 +
llvm/cmake/modules/HandleLLVMOptions.cmake | 1 -
.../cmake/modules/llvm-driver-template.cpp.in | 2 +-
llvm/docs/MIRLangRef.rst | 7 +-
.../llvm/Support/CrashRecoveryContext.h | 2 +-
llvm/include/llvm/Support/InitLLVM.h | 9 +-
llvm/include/llvm/Support/Signals.h | 8 +-
llvm/lib/Analysis/ConstantFolding.cpp | 3 +-
llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 35 +-
llvm/lib/CodeGen/MIRParser/MIParser.cpp | 97 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 4 +
llvm/lib/Support/CrashRecoveryContext.cpp | 25 +-
llvm/lib/Support/InitLLVM.cpp | 8 +-
llvm/lib/Support/KnownBits.cpp | 18 +-
llvm/lib/Support/Unix/Signals.inc | 58 +-
llvm/lib/Support/Windows/Signals.inc | 4 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.h | 8 +-
.../AArch64/GISel/AArch64RegisterBankInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPU.td | 31 +-
.../Target/AMDGPU/AMDGPULowerVGPREncoding.cpp | 9 +-
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2 +
.../Disassembler/AMDGPUDisassembler.cpp | 10 +
.../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +
llvm/lib/Target/AMDGPU/R600InstrInfo.h | 2 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 20 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 42 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 7 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 5 +-
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 148 +-
.../Target/PowerPC/PPCHazardRecognizers.cpp | 6 +-
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 24 +-
llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 2 +-
.../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 99 +-
.../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 1 +
.../RISCV/MCTargetDesc/RISCVMCTargetDesc.h | 1 +
llvm/lib/Target/RISCV/RISCVInstrFormatsV.td | 8 +
llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 1 -
llvm/lib/Target/RISCV/RISCVInstrInfo.h | 1 -
llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 12 +
llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td | 4 +
llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 4 +-
llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 10 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 4 +-
.../TargetInfo/WebAssemblyTargetInfo.h | 6 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +-
llvm/lib/TargetParser/TargetParser.cpp | 29 +-
.../AArch64/fp16_intrinsic_scalar_1op.ll | 86 +-
...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 374 +-
.../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll | 598 +-
.../GlobalISel/wmma-gfx12-w32-iu-modifiers.ll | 258 +-
.../wmma-gfx12-w32-swmmac-index_key.ll | 183 +-
.../AMDGPU/GlobalISel/wmma-gfx12-w32.ll | 298 +-
...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 291 +-
.../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll | 442 +-
.../GlobalISel/wmma-gfx12-w64-iu-modifiers.ll | 186 +-
.../wmma-gfx12-w64-swmmac-index_key.ll | 266 +-
.../AMDGPU/GlobalISel/wmma-gfx12-w64.ll | 226 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 1 +
...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 371 +-
.../test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll | 334 +-
.../AMDGPU/wmma-gfx12-w32-iu-modifiers.ll | 258 +-
.../AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll | 183 +-
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll | 298 +-
...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 303 +-
.../test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll | 334 +-
.../AMDGPU/wmma-gfx12-w64-iu-modifiers.ll | 186 +-
.../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll | 266 +-
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll | 226 +-
.../CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir | 237 +-
.../CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir | 237 +-
.../X86/expected-integer-after-tied-def.mir | 2 +-
.../MIR/X86/invalid-tied-physical-reg-def.mir | 15 +
...aren.mir => invalid-type-physical-reg.mir} | 2 +-
llvm/test/CodeGen/PowerPC/clmul-vector.ll | 8874 +++++++++++++++++
llvm/test/CodeGen/X86/known-pow2.ll | 47 +-
llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s | 1529 +++
llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s | 1529 +++
llvm/test/MC/AMDGPU/gfx1170_unsupported.s | 11 +
llvm/test/MC/AMDGPU/literals.s | 8 +-
.../AMDGPU/gfx1170_dasm_wmma_w32.txt | 1628 +++
.../AMDGPU/gfx1170_dasm_wmma_w64.txt | 1628 +++
.../ConstProp/vector-type-constant-folding.ll | 25 +
llvm/unittests/Target/X86/CMakeLists.txt | 2 +
.../Target/X86/X86SelectionDAGTest.cpp | 103 +
llvm/utils/TableGen/CodeGenMapTable.cpp | 6 +-
.../llvm/unittests/Target/X86/BUILD.gn | 1 +
.../mlir/Dialect/MemRef/Utils/MemRefUtils.h | 15 +
mlir/include/mlir/Dialect/SCF/Utils/Utils.h | 7 +
.../Conversion/MathToSPIRV/MathToSPIRV.cpp | 15 +-
mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 8 +-
mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp | 41 +
mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt | 1 +
mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 119 -
mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp | 186 +
.../lib/Dialect/SCF/Transforms/CMakeLists.txt | 1 +
.../SCF/Transforms/ParallelLoopFusion.cpp | 742 +-
mlir/lib/Dialect/SCF/Utils/Utils.cpp | 19 +
.../MathToSPIRV/math-to-gl-spirv.mlir | 21 +
mlir/test/Dialect/LLVMIR/invalid.mlir | 7 +
.../Dialect/SCF/parallel-loop-fusion.mlir | 424 +-
.../Dialect/XeGPU/LANE/no-xegpu-ops.mlir | 53 +
utils/bazel/BUILD.bazel | 8 +-
.../clang-tools-extra/clang-query/BUILD.bazel | 16 +-
.../llvm-project-overlay/libc/BUILD.bazel | 218 +-
.../lldb/source/Plugins/BUILD.bazel | 1 +
.../llvm-project-overlay/mlir/BUILD.bazel | 3 +
325 files changed, 25008 insertions(+), 4720 deletions(-)
create mode 100644 clang/test/CIR/CodeGenBuiltins/builtin-structured-binding-size.cpp
create mode 100644 clang/test/CIR/CodeGenBuiltins/builtin-trivally-copyable.cpp
create mode 100644 clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl
create mode 100644 clang/test/CodeGenHLSL/resources/cbuffer_matrix_align.hlsl
create mode 100644 clang/test/Driver/Inputs/XRSimulator1.0.sdk/usr/include/libxml/.keep
delete mode 100644 clang/test/Modules/demote-var-def.cpp
delete mode 100644 clang/test/Modules/pr149404-02.cppm
delete mode 100644 clang/test/Modules/pr172241.cppm
delete mode 100644 clang/test/Modules/var-inst-def.cppm
create mode 100644 libc/shared/math/bf16divl.h
create mode 100644 libc/shared/math/bf16fmal.h
create mode 100644 libc/shared/math/bf16mul.h
create mode 100644 libc/shared/math/bf16mulf.h
create mode 100644 libc/shared/math/bf16mulf128.h
create mode 100644 libc/shared/math/bf16mull.h
create mode 100644 libc/shared/math/f16add.h
create mode 100644 libc/shared/math/f16addf.h
create mode 100644 libc/shared/math/f16addf128.h
create mode 100644 libc/shared/math/f16addl.h
create mode 100644 libc/shared/math/logbl.h
create mode 100644 libc/shared/math/tanf16.h
create mode 100644 libc/shared/math/tanpif.h
create mode 100644 libc/src/__support/math/bf16divl.h
create mode 100644 libc/src/__support/math/bf16fmal.h
create mode 100644 libc/src/__support/math/bf16mul.h
create mode 100644 libc/src/__support/math/bf16mulf.h
create mode 100644 libc/src/__support/math/bf16mulf128.h
create mode 100644 libc/src/__support/math/bf16mull.h
create mode 100644 libc/src/__support/math/f16add.h
create mode 100644 libc/src/__support/math/f16addf.h
create mode 100644 libc/src/__support/math/f16addf128.h
create mode 100644 libc/src/__support/math/f16addl.h
create mode 100644 libc/src/__support/math/logbl.h
create mode 100644 libc/src/__support/math/tanf16.h
create mode 100644 libc/src/__support/math/tanpif.h
create mode 100644 lldb/test/API/riscv/disassembler/TestDisassembler.py
create mode 100644 lldb/test/API/riscv/disassembler/a.out.yaml
create mode 100644 lldb/test/API/riscv/disassembler/conflicting.out.yaml
create mode 100644 lldb/test/API/riscv/disassembler/stripped.out.yaml
create mode 100644 llvm/test/CodeGen/MIR/X86/invalid-tied-physical-reg-def.mir
rename llvm/test/CodeGen/MIR/X86/{expected-tied-def-after-lparen.mir => invalid-type-physical-reg.mir} (87%)
create mode 100644 llvm/test/CodeGen/PowerPC/clmul-vector.ll
create mode 100644 llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s
create mode 100644 llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s
create mode 100644 llvm/test/MC/AMDGPU/gfx1170_unsupported.s
create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt
create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt
create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/vector-type-constant-folding.ll
create mode 100644 llvm/unittests/Target/X86/X86SelectionDAGTest.cpp
create mode 100644 mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
create mode 100644 mlir/test/Integration/Dialect/XeGPU/LANE/no-xegpu-ops.mlir
diff --git a/clang-tools-extra/clangd/TidyProvider.cpp b/clang-tools-extra/clangd/TidyProvider.cpp
index 801b3af2fbdd5..bfb0835af2245 100644
--- a/clang-tools-extra/clangd/TidyProvider.cpp
+++ b/clang-tools-extra/clangd/TidyProvider.cpp
@@ -222,7 +222,8 @@ TidyProvider disableUnusableChecks(llvm::ArrayRef<std::string> ExtraBadChecks) {
"-hicpp-invalid-access-moved",
// Check uses dataflow analysis, which might hang/crash unexpectedly on
// incomplete code.
- "-bugprone-unchecked-optional-access");
+ "-bugprone-unchecked-optional-access",
+ "-abseil-unchecked-statusor-access");
size_t Size = BadChecks.size();
for (const std::string &Str : ExtraBadChecks) {
diff --git a/clang-tools-extra/docs/clang-tidy/checks/abseil/unchecked-statusor-access.rst b/clang-tools-extra/docs/clang-tidy/checks/abseil/unchecked-statusor-access.rst
index 8a766e8f6abe4..c56ff8c886e2c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/abseil/unchecked-statusor-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/abseil/unchecked-statusor-access.rst
@@ -93,7 +93,7 @@ known to have ok status. For example:
Ensuring that the status is ok using common macros
--------------------------------------------------
-The check is aware of common macros like ``ABSL_CHECK`` and ``ASSERT_THAT``.
+The check is aware of common macros like ``ABSL_CHECK`` or ``ABSL_CHECK_OK``.
Those can be used to ensure that the status of a ``StatusOr<T>`` object
is ok. For example:
@@ -104,6 +104,46 @@ is ok. For example:
use(*x);
}
+Ensuring that the status is ok using googletest macros
+------------------------------------------------------
+
+The check is aware of ``googletest`` (or ``gtest``) macros and matchers.
+Accessing the value of a ``StatusOr<T>`` object is considered safe if it
+is preceded by an ``ASSERT_`` macro that ensures the status is ok.
+For example:
+
+.. code:: cpp
+
+ TEST(MySuite, MyTest) {
+ absl::StatusOr<int> x = foo();
+ ASSERT_OK(x);
+ use(*x);
+ }
+
+ TEST(MySuite, MyOtherTest) {
+ absl::StatusOr<int> x = foo();
+ ASSERT_THAT(x, absl_testing::IsOk());
+ use(*x);
+ }
+
+The following ``googletest`` macros are supported:
+
+- ``ASSERT_OK(...)``
+- ``ASSERT_TRUE(...)``
+- ``ASSERT_FALSE(...)``
+- ``ASSERT_THAT(...)``
+
+The following matchers are supported:
+
+- ``IsOk()``
+- ``StatusIs(...)``
+- ``IsOkAndHolds(...)``
+- ``CanonicalStatusIs(...)``
+
+**Note**: ``EXPECT_`` macros (like ``EXPECT_OK`` or ``EXPECT_TRUE(x.ok())``)
+do **not** make subsequent accesses safe because they do not terminate the
+test execution.
+
Ensuring that the status is ok, then accessing the value in a correlated branch
-------------------------------------------------------------------------------
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 86cee7d1b6f9b..56c8b79e37576 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -160,6 +160,8 @@ Deprecated Compiler Flags
Modified Compiler Flags
-----------------------
- The `-mno-outline` and `-moutline` compiler flags are now allowed on RISC-V and X86, which both support the machine outliner.
+- The `-mno-outline` flag will now add the `nooutline` IR attribute, so that
+ `-mno-outline` and `-moutline` objects can be mixed correctly during LTO.
Removed Compiler Flags
----------------------
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 7ff55bc9d77a7..e51015655de65 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -139,55 +139,6 @@ core.DivideZero (C, C++, ObjC)
.. literalinclude:: checkers/dividezero_example.c
:language: c
-.. _core-FixedAddressDereference:
-
-core.FixedAddressDereference (C, C++, ObjC)
-"""""""""""""""""""""""""""""""""""""""""""
-Check for dereferences of fixed addresses.
-
-A pointer contains a fixed address if it was set to a hard-coded value or it
-becomes otherwise obvious that at that point it can have only a single fixed
-numerical value.
-
-.. code-block:: c
-
- void test1() {
- int *p = (int *)0x020;
- int x = p[0]; // warn
- }
-
- void test2(int *p) {
- if (p == (int *)-1)
- *p = 0; // warn
- }
-
- void test3() {
- int (*p_function)(char, char);
- p_function = (int (*)(char, char))0x04080;
- int x = (*p_function)('x', 'y'); // NO warning yet at functon pointer calls
- }
-
- void volatile_pointee() {
- *(volatile int *)0x404 = 1; // no warning: constant non-null "volatile" pointee, you must know what you are doing
- }
-
- void deref_volatile_nullptr() {
- *(volatile int *)0 = 1; // core.NullDereference still warns about this
- }
-
-If your project is low-level (e.g., firmware), or deals with hardware interop with a lot of genuine constant addresses, then consider disabling this checker.
-The checker automatically suppresses issues if the type of the pointee of the address is ``volatile``.
-You probably already need this to be ``volatile`` for legitimate access, so the checker suppresses such issues to avoid false-positives.
-Note that null pointers will still be reported by :ref:`core.NullDereference <core-NullDereference>`
-regardless if the pointee is ``volatile`` or not.
-
-If the analyzer option ``suppress-dereferences-from-any-address-space`` is set
-to true (the default value), then this checker never reports dereference of
-pointers with a specified address space. If the option is set to false, then
-reports from the specific x86 address spaces 256, 257 and 258 are still
-suppressed, but fixed address dereferences from other address spaces are
-reported.
-
.. _core-NonNullParamChecker:
core.NonNullParamChecker (C, C++, ObjC)
@@ -898,6 +849,55 @@ of this Clang attribute.
Projects that use this pattern should not enable this optin checker.
+.. _optin-core-FixedAddressDereference:
+
+optin.core.FixedAddressDereference (C, C++, ObjC)
+"""""""""""""""""""""""""""""""""""""""""""""""""
+Check for dereferences of fixed addresses.
+
+A pointer contains a fixed address if it was set to a hard-coded value or it
+becomes otherwise obvious that at that point it can have only a single fixed
+numerical value.
+
+.. code-block:: c
+
+ void test1() {
+ int *p = (int *)0x020;
+ int x = p[0]; // warn
+ }
+
+ void test2(int *p) {
+ if (p == (int *)-1)
+ *p = 0; // warn
+ }
+
+ void test3() {
+ int (*p_function)(char, char);
+ p_function = (int (*)(char, char))0x04080;
+ int x = (*p_function)('x', 'y'); // NO warning yet at function pointer calls
+ }
+
+ void volatile_pointee() {
+ *(volatile int *)0x404 = 1; // no warning: constant non-null "volatile" pointee, you must know what you are doing
+ }
+
+ void deref_volatile_nullptr() {
+ *(volatile int *)0 = 1; // core.NullDereference still warns about this
+ }
+
+If your project is low-level (e.g., firmware), or deals with hardware interop with a lot of genuine constant addresses, then consider disabling this checker.
+The checker automatically suppresses issues if the type of the pointee of the address is ``volatile``.
+You probably already need this to be ``volatile`` for legitimate access, so the checker suppresses such issues to avoid false-positives.
+Note that null pointers will still be reported by :ref:`core.NullDereference <core-NullDereference>`
+regardless of whether the pointee is ``volatile`` or not.
+
+If the analyzer option ``suppress-dereferences-from-any-address-space`` is set
+to true (the default value), then this checker never reports dereference of
+pointers with a specified address space. If the option is set to false, then
+reports from the specific x86 address spaces 256, 257 and 258 are still
+suppressed, but fixed address dereferences from other address spaces are
+reported.
+
.. _optin-cplusplus-UninitializedObject:
optin.cplusplus.UninitializedObject (C++)
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
index 11042e865c4e6..4b6306eb21dc3 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -17,6 +17,7 @@
#include "clang/AST/Decl.h"
#include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
#include "clang/AST/TypeOrdering.h"
#include "clang/Analysis/FlowSensitive/ASTOps.h"
#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
@@ -207,8 +208,9 @@ class DataflowAnalysisContext {
Solver::Result querySolver(llvm::SetVector<const Formula *> Constraints);
/// Returns the fields of `Type`, limited to the set of fields modeled by this
- /// context.
- FieldSet getModeledFields(QualType Type);
+ /// context. The returned reference is valid for the lifetime of the context,
+ /// or until `addModeledFields()` is called.
+ const FieldSet &getModeledFields(QualType Type);
/// Returns the names and types of the synthetic fields for the given record
/// type.
@@ -262,7 +264,11 @@ class DataflowAnalysisContext {
/// `Tokens` in the dependency graph.
llvm::DenseSet<Atom> collectDependencies(llvm::DenseSet<Atom> Tokens) const;
- // Extends the set of modeled field declarations.
+ /// Computes and returns the fields of `Type`, limited to the set of fields
+ /// modeled by this context.
+ FieldSet computeModeledFields(QualType Type);
+
+ /// Extends the set of modeled field declarations.
void addModeledFields(const FieldSet &Fields);
/// Adds all constraints of the flow condition identified by `Token` and all
@@ -326,9 +332,16 @@ class DataflowAnalysisContext {
llvm::DenseMap<const FunctionDecl *, AdornedCFG> FunctionContexts;
- // Fields modeled by environments covered by this context.
+ // Fields (from any record Type) modeled by environments using this context.
+ // The set may only contain fields that are referenced in the scope of
+ // the environments (but it is up to the environment what is relevant to
+ // model).
FieldSet ModeledFields;
+ // A lazily-computed and cached version of ModeledFields that is split by
+ // record Type.
+ llvm::DenseMap<QualType, std::unique_ptr<FieldSet>> CachedModeledFields;
+
std::unique_ptr<Logger> LogOwner; // If created via flags.
std::function<llvm::StringMap<QualType>(QualType)> SyntheticFieldCallback;
diff --git a/clang/include/clang/Analysis/Scalable/Serialization/SerializationFormatRegistry.h b/clang/include/clang/Analysis/Scalable/Serialization/SerializationFormatRegistry.h
index d7e77b9b18f77..ef060dd27c522 100644
--- a/clang/include/clang/Analysis/Scalable/Serialization/SerializationFormatRegistry.h
+++ b/clang/include/clang/Analysis/Scalable/Serialization/SerializationFormatRegistry.h
@@ -7,7 +7,24 @@
//===----------------------------------------------------------------------===//
//
// Registry for SerializationFormats, and some helper functions.
-// To register some custom serialization format, insert this code:
+//
+// To register some custom serialization format, you will need to add some
+// declarations and definitions.
+//
+// Insert this code to the header file:
+//
+// namespace llvm {
+// extern template class CLANG_TEMPLATE_ABI
+// Registry<clang::ssaf::MyFormat::FormatInfo>;
+// } // namespace llvm
+//
+// Insert this declaration to the MyFormat class:
+//
+// using FormatInfo = FormatInfoEntry<SerializerFn, DeserializerFn>;
+//
+// Insert this code to the cpp file:
+//
+// LLVM_INSTANTIATE_REGISTRY(llvm::Registry<MyFormat::FormatInfo>)
//
// static SerializationFormatRegistry::Add<MyFormat>
// RegisterFormat("MyFormat", "My awesome serialization format");
@@ -17,7 +34,7 @@
//
// namespace {
// using FormatInfo = MyFormat::FormatInfo;
-// struct MyAnalysisFormatInfo : FormatInfo {
+// struct MyAnalysisFormatInfo final : FormatInfo {
// MyAnalysisFormatInfo() : FormatInfo{
// SummaryName("MyAnalysis"),
// serializeMyAnalysis,
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 78443ac291f31..86b10eba55e8e 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -358,23 +358,23 @@ def __builtin_amdgcn_s_wait_event : AMDGPUBuiltin<"void(_Constant short)", [], "
// Postfix w32 indicates the builtin requires wavefront size of 32.
// Postfix w64 indicates the builtin requires wavefront size of 64.
//===----------------------------------------------------------------------===//
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "gfx11-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, float>)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32 : AMDGPUBuiltin<"_ExtVector<16, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<16, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<4, float>)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<4, float>)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<16, short>, _ExtVector<16, short>, _ExtVector<8, short>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<4, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-256b-insts,wavefrontsize64">;
def __builtin_amdgcn_s_sendmsg_rtn : AMDGPUBuiltin<"unsigned int(_Constant unsigned int)", [], "gfx11-insts">;
def __builtin_amdgcn_s_sendmsg_rtnl : AMDGPUBuiltin<"uint64_t(_Constant unsigned int)", [], "gfx11-insts">;
@@ -599,67 +599,71 @@ def __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn : AMDGPUBuiltin<"_ExtVector<2,
// The second return value of the intrinsic is zext'ed.
def __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn : AMDGPUBuiltin<"_ExtVector<2, uint64_t>(unsigned int, unsigned int, _ExtVector<8, unsigned int>, _Constant int)", [], "gfx12-insts">;
+//===----------------------------------------------------------------------===//
+// GFX1170, GFX12+ only builtins.
+//===----------------------------------------------------------------------===//
+
//===----------------------------------------------------------------------===//
// WMMA builtins.
// Postfix w32 indicates the builtin requires wavefront size of 32.
// Postfix w64 indicates the builtin requires wavefront size of 64.
//
-// Some of these are very similar to their GFX11 counterparts, but they don't
-// require replication of the A,B matrices, so they use fewer vector elements.
-// Therefore, we add an "_gfx12" suffix to distinguish them from the existing
-// builtins.
+// Some of these are very similar to their base GFX11 counterparts, but they
+// don't require replication of the A,B matrices, so they use fewer vector
+// elements. Therefore, we add an "_gfx12" suffix to distinguish them from the
+// existing builtins.
//===----------------------------------------------------------------------===//
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-// These are gfx12-only, but for consistency with the other WMMA variants we're
-// keeping the "_gfx12" suffix.
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-// These are gfx12-only, but for consistency with the other WMMA variants we're
-// keeping the "_gfx12" suffix.
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-
-def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "gfx12-insts,wavefrontsize32">;
-
-def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
-def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "gfx12-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<8, _Float16>, _ExtVector<8, _Float16>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<8, short>, _ExtVector<8, short>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, int, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+// These are gfx1170 and gfx12 only, but for consistency with the other WMMA
+// variants we're keeping the "_gfx12" suffix.
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<2, int>, _ExtVector<8, float>)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<4, _Float16>, _ExtVector<4, _Float16>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<4, short>, _ExtVector<4, short>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+// These are gfx1170 and gfx12 only, but for consistency with the other WMMA
+// variants we're keeping the "_gfx12" suffix.
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, float>(int, int, _ExtVector<4, float>)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+
+def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">;
+
+def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
+def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(int, _ExtVector<2, int>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">;
def __builtin_amdgcn_prng_b32 : AMDGPUBuiltin<"unsigned int(unsigned int)", [Const], "prng-inst">;
def __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUBuiltin<"_ExtVector<6, unsigned int>(_ExtVector<32, _Float16>, float)", [Const], "f16bf16-to-fp6bf6-cvt-scale-insts">;
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 8c056bb690690..5e174b21be466 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -509,6 +509,9 @@ CODEGENOPT(AllResourcesBound, 1, 0, Benign)
ENUM_CODEGENOPT(WinX64EHUnwindV2, WinX64EHUnwindV2Mode,
2, WinX64EHUnwindV2Mode::Disabled, Benign)
+/// Adds attributes that prevent outlining (`-mno-outline`)
+CODEGENOPT(DisableOutlining, 1, 0, Benign)
+
/// FIXME: Make DebugOptions its own top-level .def file.
#include "DebugOptions.def"
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index cf312af194e85..78d7ef7510000 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -6211,12 +6211,6 @@ def CIR_TryOp : CIR_Op<"try",[
Holds the lexical scope of `try {}`. Note that resources used on catch
clauses are usually allocated in the same parent as `cir.try`.
- `synthetic`: use `cir.try` to represent try/catches not originally
- present in the source code. For example, a synthetic `cir.try` region
- is created around the constructor call when `operator new` is used
- so that the memory allocated will be freed if the constructor throws
- an exception.
-
`cleanup`: indicates that there are cleanups that must be performed
when exiting the try region via exception, even if the exception is not
caught.
@@ -6238,7 +6232,6 @@ def CIR_TryOp : CIR_Op<"try",[
}];
let arguments = (ins
- UnitAttr:$synthetic,
UnitAttr:$cleanup,
DefaultValuedAttr<CIR_TryHandlerArrayAttr, "{}">:$handler_types
);
@@ -6249,7 +6242,6 @@ def CIR_TryOp : CIR_Op<"try",[
);
let assemblyFormat = [{
- (`synthetic` $synthetic^)?
(`cleanup` $cleanup^)?
$try_region
custom<TryHandlerRegions>($handler_regions, $handler_types)
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 24b31fb3fefcc..c8fb2a55fe7ac 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5300,16 +5300,13 @@ def mmacos_version_min_EQ : Joined<["-"], "mmacos-version-min=">,
def : Joined<["-"], "mmacosx-version-min=">,
Visibility<[ClangOption, CC1Option, FC1Option, FlangOption]>,
Group<m_Group>, Alias<mmacos_version_min_EQ>;
-def moutline
- : Flag<["-"], "moutline">,
- Group<f_clang_Group>,
- Visibility<[ClangOption, CC1Option]>,
- HelpText<"Enable function outlining (AArch64,Arm,RISC-V,X86 only)">;
-def mno_outline
- : Flag<["-"], "mno-outline">,
- Group<f_clang_Group>,
- Visibility<[ClangOption, CC1Option]>,
- HelpText<"Disable function outlining (AArch64,Arm,RISC-V,X86 only)">;
+defm outline
+ : BoolMOption<
+ "outline", CodeGenOpts<"DisableOutlining">, DefaultFalse,
+ NegFlag<SetTrue, [], [ClangOption, CC1Option],
+ "Disable function outlining (AArch64,Arm,RISC-V,X86 only)">,
+ PosFlag<SetFalse, [], [ClangOption],
+ "Enable function outlining (AArch64,Arm,RISC-V,X86 only)">>;
def mms_bitfields : Flag<["-"], "mms-bitfields">, Group<m_Group>,
HelpText<"Set the default structure layout to be compatible with the Microsoft compiler standard">;
def mno_ms_bitfields : Flag<["-"], "mno-ms-bitfields">, Group<m_Group>,
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 58e785d5ca36f..35d2f9c1d5ef1 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -188,11 +188,6 @@ def CallAndMessageChecker
]>,
Documentation<HasDocumentation>;
-def FixedAddressDereferenceChecker
- : Checker<"FixedAddressDereference">,
- HelpText<"Check for dereferences of fixed addresses">,
- Documentation<HasDocumentation>;
-
def NullDereferenceChecker
: Checker<"NullDereference">,
HelpText<"Check for dereferences of null pointers">,
@@ -426,6 +421,11 @@ def EnumCastOutOfRangeChecker : Checker<"EnumCastOutOfRange">,
HelpText<"Check integer to enumeration casts for out of range values">,
Documentation<HasDocumentation>;
+def FixedAddressDereferenceChecker
+ : Checker<"FixedAddressDereference">,
+ HelpText<"Check for dereferences of fixed addresses">,
+ Documentation<HasDocumentation>;
+
} // end "optin.core"
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp
index 5b745dd3c43f5..15d0e6435aaf3 100644
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@@ -40,6 +40,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <array>
#include <cassert>
#include <cstring>
#include <optional>
@@ -57,25 +58,23 @@ using namespace clang;
#define ABSTRACT_STMT(STMT)
#include "clang/AST/StmtNodes.inc"
-static struct StmtClassNameTable {
+struct StmtClassNameTable {
const char *Name;
unsigned Counter;
unsigned Size;
-} StmtClassInfo[Stmt::lastStmtConstant+1];
+};
static StmtClassNameTable &getStmtInfoTableEntry(Stmt::StmtClass E) {
- static bool Initialized = false;
- if (Initialized)
- return StmtClassInfo[E];
-
- // Initialize the table on the first use.
- Initialized = true;
+ static std::array<StmtClassNameTable, Stmt::lastStmtConstant + 1>
+ StmtClassInfo = [] {
+ std::array<StmtClassNameTable, Stmt::lastStmtConstant + 1> Table{};
#define ABSTRACT_STMT(STMT)
-#define STMT(CLASS, PARENT) \
- StmtClassInfo[(unsigned)Stmt::CLASS##Class].Name = #CLASS; \
- StmtClassInfo[(unsigned)Stmt::CLASS##Class].Size = sizeof(CLASS);
+#define STMT(CLASS, PARENT) \
+ Table[static_cast<unsigned>(Stmt::CLASS##Class)].Name = #CLASS; \
+ Table[static_cast<unsigned>(Stmt::CLASS##Class)].Size = sizeof(CLASS);
#include "clang/AST/StmtNodes.inc"
-
+ return Table;
+ }();
return StmtClassInfo[E];
}
@@ -85,7 +84,7 @@ void *Stmt::operator new(size_t bytes, const ASTContext& C,
}
const char *Stmt::getStmtClassName() const {
- return getStmtInfoTableEntry((StmtClass) StmtBits.sClass).Name;
+ return getStmtInfoTableEntry(static_cast<StmtClass>(StmtBits.sClass)).Name;
}
// Check that no statement / expression class is polymorphic. LLVM style RTTI
@@ -113,19 +112,25 @@ void Stmt::PrintStats() {
unsigned sum = 0;
llvm::errs() << "\n*** Stmt/Expr Stats:\n";
for (int i = 0; i != Stmt::lastStmtConstant+1; i++) {
- if (StmtClassInfo[i].Name == nullptr) continue;
- sum += StmtClassInfo[i].Counter;
+ const StmtClassNameTable &Entry =
+ getStmtInfoTableEntry(static_cast<Stmt::StmtClass>(i));
+ if (Entry.Name == nullptr)
+ continue;
+ sum += Entry.Counter;
}
llvm::errs() << " " << sum << " stmts/exprs total.\n";
sum = 0;
for (int i = 0; i != Stmt::lastStmtConstant+1; i++) {
- if (StmtClassInfo[i].Name == nullptr) continue;
- if (StmtClassInfo[i].Counter == 0) continue;
- llvm::errs() << " " << StmtClassInfo[i].Counter << " "
- << StmtClassInfo[i].Name << ", " << StmtClassInfo[i].Size
- << " each (" << StmtClassInfo[i].Counter*StmtClassInfo[i].Size
+ const StmtClassNameTable &Entry =
+ getStmtInfoTableEntry(static_cast<Stmt::StmtClass>(i));
+ if (Entry.Name == nullptr)
+ continue;
+ if (Entry.Counter == 0)
+ continue;
+ llvm::errs() << " " << Entry.Counter << " " << Entry.Name << ", "
+ << Entry.Size << " each (" << Entry.Counter * Entry.Size
<< " bytes)\n";
- sum += StmtClassInfo[i].Counter*StmtClassInfo[i].Size;
+ sum += Entry.Counter * Entry.Size;
}
llvm::errs() << "Total bytes = " << sum << "\n";
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
index 6e3a270e6bed6..1a68b2e81634f 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
+#include "clang/AST/Type.h"
#include "clang/Analysis/FlowSensitive/ASTOps.h"
#include "clang/Analysis/FlowSensitive/Formula.h"
#include "clang/Analysis/FlowSensitive/Logger.h"
@@ -43,7 +44,7 @@ static llvm::cl::opt<std::string> DataflowLog(
namespace clang {
namespace dataflow {
-FieldSet DataflowAnalysisContext::getModeledFields(QualType Type) {
+FieldSet DataflowAnalysisContext::computeModeledFields(QualType Type) {
// During context-sensitive analysis, a struct may be allocated in one
// function, but its field accessed in a function lower in the stack than
// the allocation. Since we only collect fields used in the function where
@@ -57,8 +58,17 @@ FieldSet DataflowAnalysisContext::getModeledFields(QualType Type) {
return llvm::set_intersection(getObjectFields(Type), ModeledFields);
}
+const FieldSet &DataflowAnalysisContext::getModeledFields(QualType Type) {
+ QualType CanonicalType = Type.getCanonicalType().getUnqualifiedType();
+ std::unique_ptr<FieldSet> &Fields = CachedModeledFields[CanonicalType];
+ if (Fields == nullptr)
+ Fields = std::make_unique<FieldSet>(computeModeledFields(CanonicalType));
+ return *Fields;
+}
+
void DataflowAnalysisContext::addModeledFields(const FieldSet &Fields) {
ModeledFields.set_union(Fields);
+ CachedModeledFields.clear();
}
StorageLocation &DataflowAnalysisContext::createStorageLocation(QualType Type) {
diff --git a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
index 03d6ed8020a0a..767521334b0a2 100644
--- a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
+++ b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
@@ -85,7 +85,7 @@ void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
// Dst may have children modeled from other derived types than SrcType, e.g.
// after casts of Dst to other types derived from DstType. Only copy the
// children and synthetic fields present in both Dst and SrcType.
- const FieldSet FieldsInSrcType =
+ const FieldSet &FieldsInSrcType =
Env.getDataflowAnalysisContext().getModeledFields(SrcType);
for (auto [Field, DstFieldLoc] : Dst.children())
if (const auto *FieldAsFieldDecl = dyn_cast<FieldDecl>(Field);
@@ -103,7 +103,7 @@ void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
// after other casts of Src to those types (likely in different branches,
// but without flow-condition-dependent field modeling). Only copy the
// children and synthetic fields of Src that are present in DstType.
- const FieldSet FieldsInDstType =
+ const FieldSet &FieldsInDstType =
Env.getDataflowAnalysisContext().getModeledFields(DstType);
for (auto [Field, SrcFieldLoc] : Src.children()) {
if (const auto *FieldAsFieldDecl = dyn_cast<FieldDecl>(Field);
diff --git a/clang/lib/Basic/ParsedAttrInfo.cpp b/clang/lib/Basic/ParsedAttrInfo.cpp
index 16fa314b642b9..d5b17b34b6e3a 100644
--- a/clang/lib/Basic/ParsedAttrInfo.cpp
+++ b/clang/lib/Basic/ParsedAttrInfo.cpp
@@ -20,13 +20,16 @@ using namespace clang;
LLVM_INSTANTIATE_REGISTRY(ParsedAttrInfoRegistry)
+static std::list<std::unique_ptr<ParsedAttrInfo>> instantiateEntries() {
+ std::list<std::unique_ptr<ParsedAttrInfo>> Instances;
+ for (const auto &It : ParsedAttrInfoRegistry::entries())
+ Instances.emplace_back(It.instantiate());
+ return Instances;
+}
+
const std::list<std::unique_ptr<ParsedAttrInfo>> &
clang::getAttributePluginInstances() {
- static llvm::ManagedStatic<std::list<std::unique_ptr<ParsedAttrInfo>>>
- PluginAttrInstances;
- if (PluginAttrInstances->empty())
- for (const auto &It : ParsedAttrInfoRegistry::entries())
- PluginAttrInstances->emplace_back(It.instantiate());
-
- return *PluginAttrInstances;
+ static std::list<std::unique_ptr<ParsedAttrInfo>> Instances =
+ instantiateEntries();
+ return Instances;
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 71cf896aede10..699fee5a3a358 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -123,6 +123,17 @@ emitAArch64CompareBuiltinExpr(CIRGenFunction &cgf, CIRGenBuilderTy &builder,
return builder.createCast(loc, cir::CastKind::integral, cmp, retTy);
}
+// Emit an intrinsic where all operands are of the same type as the result.
+// Depending on mode, this may be a constrained floating-point intrinsic.
+static mlir::Value
+emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc,
+ StringRef intrName, mlir::Type retTy,
+ llvm::SmallVector<mlir::Value> &ops) {
+ assert(!cir::MissingFeatures::emitConstrainedFPCall());
+
+ return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops);
+}
+
bool CIRGenFunction::getAArch64SVEProcessedOperands(
unsigned builtinID, const CallExpr *expr, SmallVectorImpl<mlir::Value> &ops,
SVETypeFlags typeFlags) {
@@ -1344,10 +1355,41 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
// Find out if any arguments are required to be integer constant
// expressions.
assert(!cir::MissingFeatures::handleBuiltinICEArguments());
+ unsigned iceArguments = 0;
+ ASTContext::GetBuiltinTypeError error;
+ getContext().GetBuiltinType(builtinID, error, &iceArguments);
+ assert(error == ASTContext::GE_None && "Should not codegen an error");
+ llvm::SmallVector<mlir::Value> ops;
+ for (auto [idx, arg] : llvm::enumerate(expr->arguments())) {
+ if (idx == 0) {
+ switch (builtinID) {
+ case NEON::BI__builtin_neon_vld1_v:
+ case NEON::BI__builtin_neon_vld1q_v:
+ case NEON::BI__builtin_neon_vld1_dup_v:
+ case NEON::BI__builtin_neon_vld1q_dup_v:
+ case NEON::BI__builtin_neon_vld1_lane_v:
+ case NEON::BI__builtin_neon_vld1q_lane_v:
+ case NEON::BI__builtin_neon_vst1_v:
+ case NEON::BI__builtin_neon_vst1q_v:
+ case NEON::BI__builtin_neon_vst1_lane_v:
+ case NEON::BI__builtin_neon_vst1q_lane_v:
+ case NEON::BI__builtin_neon_vldap1_lane_s64:
+ case NEON::BI__builtin_neon_vldap1q_lane_s64:
+ case NEON::BI__builtin_neon_vstl1_lane_s64:
+ case NEON::BI__builtin_neon_vstl1q_lane_s64:
+ // Get the alignment for the argument in addition to the value;
+ // we'll use it later.
+ cgm.errorNYI(
+ expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin argument handling ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ }
+ }
+ ops.push_back(emitScalarOrConstFoldImmArg(iceArguments, idx, arg));
+ }
assert(!cir::MissingFeatures::neonSISDIntrinsics());
- llvm::SmallVector<mlir::Value> ops;
mlir::Location loc = getLoc(expr->getExprLoc());
// Handle non-overloaded intrinsics first.
@@ -1355,7 +1397,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
default:
break;
case NEON::BI__builtin_neon_vabsh_f16: {
- ops.push_back(emitScalarExpr(expr->getArg(0)));
return cir::FAbsOp::create(builder, loc, ops);
}
case NEON::BI__builtin_neon_vaddq_p128:
@@ -1397,7 +1438,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
getContext().BuiltinInfo.getName(builtinID));
return mlir::Value{};
case NEON::BI__builtin_neon_vceqzd_s64:
- ops.push_back(emitScalarExpr(expr->getArg(0)));
return emitAArch64CompareBuiltinExpr(
*this, builder, loc, ops[0],
convertType(expr->getCallReturnType(getContext())), cir::CmpOpKind::eq);
@@ -1451,11 +1491,9 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
getContext().BuiltinInfo.getName(builtinID));
return mlir::Value{};
case NEON::BI__builtin_neon_vnegd_s64: {
- ops.push_back(emitScalarExpr(expr->getArg(0)));
return builder.createNeg(ops[0]);
}
case NEON::BI__builtin_neon_vnegh_f16: {
- ops.push_back(emitScalarExpr(expr->getArg(0)));
return builder.createFNeg(ops[0]);
}
case NEON::BI__builtin_neon_vtstd_s64:
@@ -1508,8 +1546,22 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
case NEON::BI__builtin_neon_vsubh_f16:
case NEON::BI__builtin_neon_vmulh_f16:
case NEON::BI__builtin_neon_vdivh_f16:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
case NEON::BI__builtin_neon_vfmah_f16:
+ // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+ std::rotate(ops.begin(), ops.begin() + 1, ops.end());
+ return emitCallMaybeConstrainedBuiltin(builder, loc, "fma",
+ convertType(expr->getType()), ops);
+ break;
case NEON::BI__builtin_neon_vfmsh_f16:
+ // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+ std::rotate(ops.begin(), ops.begin() + 1, ops.end());
+ ops[0] = builder.createFNeg(ops[0]);
+ return emitCallMaybeConstrainedBuiltin(builder, loc, "fma",
+ convertType(expr->getType()), ops);
case NEON::BI__builtin_neon_vaddd_s64:
case NEON::BI__builtin_neon_vaddd_u64:
case NEON::BI__builtin_neon_vsubd_s64:
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 5977f8c585e26..2c5a57e1ba2ee 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -870,12 +870,21 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
return {};
}
mlir::Value VisitTypeTraitExpr(const TypeTraitExpr *e) {
+ // We diverge slightly from classic codegen here because CIR has stricter
+ // typing. In LLVM IR, constant folding covers up some potential type
+ // mismatches such as bool-to-int conversions that would fail the verifier
+ // in CIR. To make things work, we need to be sure we only emit a bool value
+ // if the expression type is bool.
mlir::Location loc = cgf.getLoc(e->getExprLoc());
- if (e->isStoredAsBoolean())
- return builder.getBool(e->getBoolValue(), loc);
- cgf.cgm.errorNYI(e->getSourceRange(),
- "ScalarExprEmitter: TypeTraitExpr stored as int");
- return {};
+ if (e->isStoredAsBoolean()) {
+ if (e->getType()->isBooleanType())
+ return builder.getBool(e->getBoolValue(), loc);
+ assert(e->getType()->isIntegerType() &&
+ "Expected int type for TypeTraitExpr");
+ return builder.getConstInt(loc, cgf.convertType(e->getType()),
+ (uint64_t)e->getBoolValue());
+ }
+ return builder.getConstInt(loc, e->getAPValue().getInt());
}
mlir::Value
VisitConceptSpecializationExpr(const ConceptSpecializationExpr *e) {
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 3d4aa552b6af2..eb778f3583a98 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2426,7 +2426,16 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *value, LValue lvalue,
static RValue EmitLoadOfMatrixLValue(LValue LV, SourceLocation Loc,
CodeGenFunction &CGF) {
assert(LV.getType()->isConstantMatrixType());
- Address Addr = MaybeConvertMatrixAddress(LV.getAddress(), CGF);
+ RawAddress DestAddr = LV.getAddress();
+
+ // HLSL constant buffers may pad matrix layouts, so copy elements into a
+ // non-padded local alloca before loading.
+ if (CGF.getLangOpts().HLSL &&
+ LV.getType().getAddressSpace() == LangAS::hlsl_constant)
+ DestAddr =
+ CGF.CGM.getHLSLRuntime().createBufferMatrixTempAddress(LV, Loc, CGF);
+
+ Address Addr = MaybeConvertMatrixAddress(DestAddr, CGF);
LV.setAddress(Addr);
return RValue::get(CGF.EmitLoadOfScalar(LV, Loc));
}
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index c68c9f16482ff..805f7a8b4445b 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -1356,6 +1356,32 @@ bool CGHLSLRuntime::emitResourceArrayCopy(LValue &LHS, Expr *RHSExpr,
return EndIndex.has_value();
}
+RawAddress CGHLSLRuntime::createBufferMatrixTempAddress(const LValue &LV,
+ SourceLocation Loc,
+ CodeGenFunction &CGF) {
+
+ assert(LV.getType()->isConstantMatrixType() && "expected matrix type");
+ assert(LV.getType().getAddressSpace() == LangAS::hlsl_constant &&
+ "expected cbuffer matrix");
+
+ QualType MatQualTy = LV.getType();
+ llvm::Type *MemTy = CGF.ConvertTypeForMem(MatQualTy);
+ llvm::Type *LayoutTy = HLSLBufferLayoutBuilder(CGF.CGM).layOutType(MatQualTy);
+
+ if (LayoutTy == MemTy)
+ return LV.getAddress();
+
+ Address SrcAddr = LV.getAddress();
+ // NOTE: Because CreateMemTemp flattens MatrixTypes, which causes
+ // overlapping GEPs in emitBufferCopy. Use CreateTempAlloca with
+ // the non-padded layout.
+ CharUnits Align =
+ CharUnits::fromQuantity(CGF.CGM.getDataLayout().getABITypeAlign(MemTy));
+ RawAddress DestAlloca = CGF.CreateTempAlloca(MemTy, Align, "matrix.buf.copy");
+ emitBufferCopy(CGF, DestAlloca, SrcAddr, MatQualTy);
+ return DestAlloca;
+}
+
std::optional<LValue> CGHLSLRuntime::emitBufferArraySubscriptExpr(
const ArraySubscriptExpr *E, CodeGenFunction &CGF,
llvm::function_ref<llvm::Value *(bool Promote)> EmitIdxAfterBase) {
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 62349c9dea7eb..dbbc887353cec 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -281,6 +281,9 @@ class CGHLSLRuntime {
const ArraySubscriptExpr *E, CodeGenFunction &CGF,
llvm::function_ref<llvm::Value *(bool Promote)> EmitIdxAfterBase);
+ RawAddress createBufferMatrixTempAddress(const LValue &LV, SourceLocation Loc,
+ CodeGenFunction &CGF);
+
bool emitBufferCopy(CodeGenFunction &CGF, Address DestPtr, Address SrcPtr,
QualType CType);
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index a5ef4ac9d361d..29dcabd1b0971 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -248,6 +248,8 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) {
LLVMContext &Ctx = getModule()->getContext();
std::unique_ptr<DiagnosticHandler> OldDiagnosticHandler =
Ctx.getDiagnosticHandler();
+ llvm::scope_exit RestoreDiagnosticHandler(
+ [&]() { Ctx.setDiagnosticHandler(std::move(OldDiagnosticHandler)); });
Ctx.setDiagnosticHandler(std::make_unique<ClangDiagnosticHandler>(
CodeGenOpts, this));
@@ -311,8 +313,6 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) {
C.getTargetInfo().getDataLayoutString(), getModule(),
Action, FS, std::move(AsmOutStream), this);
- Ctx.setDiagnosticHandler(std::move(OldDiagnosticHandler));
-
if (OptRecordFile)
OptRecordFile->keep();
}
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 6a087be3751f0..43b8af0b2156a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2928,7 +2928,9 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
B.addAttribute(llvm::Attribute::MinSize);
}
- if (D->hasAttr<NoOutlineAttr>())
+ // Add `nooutline` if Outlining is disabled with a command-line flag or a
+ // function attribute.
+ if (CodeGenOpts.DisableOutlining || D->hasAttr<NoOutlineAttr>())
B.addAttribute(llvm::Attribute::NoOutline);
F->addFnAttrs(B);
diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
index 07cc738882b50..0b644b9d8e441 100644
--- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
+++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
@@ -103,10 +103,8 @@ HLSLBufferLayoutBuilder::layOutStruct(const RecordType *RT,
return NewTy;
}
-llvm::Type *HLSLBufferLayoutBuilder::layOutArray(const ConstantArrayType *AT) {
- llvm::Type *EltTy = layOutType(AT->getElementType());
- uint64_t Count = AT->getZExtSize();
-
+llvm::Type *HLSLBufferLayoutBuilder::padArrayElements(llvm::Type *EltTy,
+ uint64_t Count) {
CharUnits EltSize =
CharUnits::fromQuantity(CGM.getDataLayout().getTypeSizeInBits(EltTy) / 8);
CharUnits Padding = EltSize.alignTo(CBufferRowSize) - EltSize;
@@ -127,6 +125,22 @@ llvm::Type *HLSLBufferLayoutBuilder::layOutArray(const ConstantArrayType *AT) {
/*IsPacked=*/true);
}
+llvm::Type *HLSLBufferLayoutBuilder::layOutArray(const ConstantArrayType *AT) {
+ llvm::Type *EltTy = layOutType(AT->getElementType());
+ uint64_t Count = AT->getZExtSize();
+ return padArrayElements(EltTy, Count);
+}
+
+llvm::Type *
+HLSLBufferLayoutBuilder::layOutMatrix(const ConstantMatrixType *MT) {
+ // ConvertTypeForMem already handles row/column-major layout and bool
+ // promotion, producing [Count x <VecLen x EltTy>]. We just need to add
+ // cbuffer padding between the array elements.
+ llvm::ArrayType *MemTy =
+ cast<llvm::ArrayType>(CGM.getTypes().ConvertTypeForMem(QualType(MT, 0)));
+ return padArrayElements(MemTy->getElementType(), MemTy->getNumElements());
+}
+
llvm::Type *HLSLBufferLayoutBuilder::layOutType(QualType Ty) {
if (const auto *AT = CGM.getContext().getAsConstantArrayType(Ty))
return layOutArray(AT);
@@ -136,6 +150,11 @@ llvm::Type *HLSLBufferLayoutBuilder::layOutType(QualType Ty) {
return layOutStruct(Ty->getAsCanonical<RecordType>(), EmptyOffsets);
}
+ if (Ty->isConstantMatrixType()) {
+ const auto *MT = Ty->castAs<ConstantMatrixType>();
+ return layOutMatrix(MT);
+ }
+
return CGM.getTypes().ConvertTypeForMem(Ty);
}
diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
index c55f680fe5a98..5d75b36993d1f 100644
--- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
+++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
@@ -25,6 +25,10 @@ class HLSLBufferLayoutBuilder {
private:
CodeGenModule &CGM;
+ /// Pads an array of elements to 16-byte cbuffer row boundaries.
+ /// This implements the common pattern of padding all-but-the-last element.
+ llvm::Type *padArrayElements(llvm::Type *EltTy, uint64_t Count);
+
public:
HLSLBufferLayoutBuilder(CodeGenModule &CGM) : CGM(CGM) {}
@@ -45,6 +49,9 @@ class HLSLBufferLayoutBuilder {
/// Lays out an array type following HLSL buffer rules.
llvm::Type *layOutArray(const ConstantArrayType *AT);
+ /// Lays out a matrix type following HLSL buffer rules.
+ llvm::Type *layOutMatrix(const ConstantMatrixType *MT);
+
/// Lays out a type following HLSL buffer rules. Arrays and structures will be
/// padded appropriately and nested objects will be converted as appropriate.
llvm::Type *layOutType(QualType Type);
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index d7d744d1770b6..1f3f66bf37c4a 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2318,7 +2318,14 @@ int Driver::ExecuteCompilation(
if (!FailingCommand->getCreator().hasGoodDiagnostics() || CommandRes != 1) {
// FIXME: See FIXME above regarding result code interpretation.
+#if LLVM_ON_UNIX
+ // On Unix, signals are represented by return codes of 128 plus the
+ // signal number. Return code 255 is excluded because some tools,
+ // such as llvm-ifs, exit with code 255 (-1) on failure.
+ if (CommandRes > 128 && CommandRes != 255)
+#else
if (CommandRes < 0)
+#endif
Diag(clang::diag::err_drv_command_signalled)
<< FailingTool.getShortName();
else
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 8bb271d27a3c4..9a17fa2546e68 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2966,11 +2966,12 @@ void tools::addMachineOutlinerArgs(const Driver &D,
D.Diag(diag::warn_drv_moutline_unsupported_opt) << Triple.getArchName();
}
} else {
- // Disable all outlining behaviour.
- //
- // FIXME: This should probably use the `nooutline` attribute rather than
- // tweaking Pipeline Pass flags, so `-mno-outline` and `-moutline` objects
- // can be combined correctly during LTO.
+ if (!IsLTO)
+ // Disable all outlining behaviour using `nooutline` attribute, in case
+ // Linker Invocation lacks `-mno-outline`.
+ CmdArgs.push_back("-mno-outline");
+
+ // Disable Pass in Pipeline
addArg(Twine("-enable-machine-outliner=never"));
}
}
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 1c95a79a52a9c..74fcb10c0be22 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1145,11 +1145,11 @@ void Darwin::VerifyTripleForSDK(const llvm::opt::ArgList &Args,
getDriver().Diag(diag::warn_incompatible_sysroot)
<< SDKInfo->getDisplayName() << Triple.getTriple();
} else if (const Arg *A = Args.getLastArg(options::OPT_isysroot)) {
+ // If there is no SDK info, assume this is building against an SDK that
+ // predates SDKSettings.json. Try to match the triple to the SDK path.
const char *isysroot = A->getValue();
- StringRef SDK = getSDKName(isysroot);
- if (!SDK.empty()) {
- size_t StartVer = SDK.find_first_of("0123456789");
- StringRef SDKName = SDK.slice(0, StartVer);
+ StringRef SDKName = getSDKName(isysroot);
+ if (!SDKName.empty()) {
bool supported = true;
if (Triple.isWatchOS())
supported = SDKName.starts_with("Watch");
@@ -1161,9 +1161,8 @@ void Darwin::VerifyTripleForSDK(const llvm::opt::ArgList &Args,
supported = SDKName.starts_with("iPhone");
else if (Triple.isMacOSX())
supported = SDKName.starts_with("MacOSX");
- else
- llvm::reportFatalUsageError(Twine("SDK at '") + isysroot +
- "' missing SDKSettings.json.");
+ // If it's not an older SDK, then it might be a damaged SDK or a
+ // non-standard -isysroot path. Don't try to diagnose that here.
if (!supported)
getDriver().Diag(diag::warn_incompatible_sysroot)
@@ -2484,6 +2483,8 @@ void Darwin::AddDeploymentTarget(DerivedArgList &Args) const {
// Read the SDKSettings.json file for more information, like the SDK version
// that we can pass down to the compiler.
SDKInfo = parseSDKSettings(getVFS(), Args, getDriver());
+ // FIXME: If SDKInfo is std::nullopt, diagnose a bad isysroot value (e.g.
+ // doesn't end in .sdk).
// The OS and the version can be specified using the -target argument.
std::optional<DarwinPlatform> PlatformAndVersion =
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 4823bb4265789..802a1bdbccfdd 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -218,8 +218,8 @@ Decl *SemaHLSL::ActOnStartBuffer(Scope *BufferScope, bool CBuffer,
static unsigned calculateLegacyCbufferFieldAlign(const ASTContext &Context,
QualType T) {
- // Arrays and Structs are always aligned to new buffer rows
- if (T->isArrayType() || T->isStructureType())
+ // Arrays, Matrices, and Structs are always aligned to new buffer rows
+ if (T->isArrayType() || T->isStructureType() || T->isConstantMatrixType())
return 16;
// Vectors are aligned to the type they contain
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 498ffd0887630..b79c22603494c 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -6947,10 +6947,39 @@ void InitializationSequence::InitializeFrom(Sema &S,
// For HLSL ext vector types we allow list initialization behavior for C++
// functional cast expressions which look like constructor syntax. This is
// accomplished by converting initialization arguments to InitListExpr.
- if (S.getLangOpts().HLSL && Args.size() > 1 &&
- (DestType->isExtVectorType() || DestType->isConstantMatrixType()) &&
- (SourceType.isNull() ||
- !Context.hasSameUnqualifiedType(SourceType, DestType))) {
+ auto ShouldTryListInitialization = [&]() -> bool {
+ // Only try list initialization for HLSL.
+ if (!S.getLangOpts().HLSL)
+ return false;
+
+ bool DestIsVec = DestType->isExtVectorType();
+ bool DestIsMat = DestType->isConstantMatrixType();
+
+ // If the destination type is neither a vector nor a matrix, then don't try
+ // list initialization.
+ if (!DestIsVec && !DestIsMat)
+ return false;
+
+ // If there is only a single source argument, then only try list
+ // initialization if initializing a matrix with a vector or vice versa.
+ if (Args.size() == 1) {
+ assert(!SourceType.isNull() &&
+ "Source QualType should not be null when arg size is exactly 1");
+ bool SourceIsVec = SourceType->isExtVectorType();
+ bool SourceIsMat = SourceType->isConstantMatrixType();
+
+ if (DestIsMat && !SourceIsVec)
+ return false;
+ if (DestIsVec && !SourceIsMat)
+ return false;
+ }
+
+ // Try list initialization if the source type is null or if the
+ // destination and source types differ.
+ return SourceType.isNull() ||
+ !Context.hasSameUnqualifiedType(SourceType, DestType);
+ };
+ if (ShouldTryListInitialization()) {
InitListExpr *ILE = new (Context)
InitListExpr(S.getASTContext(), Args.front()->getBeginLoc(), Args,
Args.back()->getEndLoc());
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index a6d4b989cae3d..c0c0ab7a09c72 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -9309,59 +9309,11 @@ bool Sema::hasAcceptableDefinition(NamedDecl *D, NamedDecl **Suggested,
// If this definition was instantiated from a template, map back to the
// pattern from which it was instantiated.
- if (isa<TagDecl>(D) && cast<TagDecl>(D)->isBeingDefined())
+ if (isa<TagDecl>(D) && cast<TagDecl>(D)->isBeingDefined()) {
// We're in the middle of defining it; this definition should be treated
// as visible.
return true;
-
- auto DefinitionIsAcceptable = [&](NamedDecl *D) {
- // The (primary) definition might be in a visible module.
- if (isAcceptable(D, Kind))
- return true;
-
- // A visible module might have a merged definition instead.
- if (D->isModulePrivate() ? hasMergedDefinitionInCurrentModule(D)
- : hasVisibleMergedDefinition(D)) {
- if (CodeSynthesisContexts.empty() &&
- !getLangOpts().ModulesLocalVisibility) {
- // Cache the fact that this definition is implicitly visible because
- // there is a visible merged definition.
- D->setVisibleDespiteOwningModule();
- }
- return true;
- }
-
- return false;
- };
- auto IsDefinition = [](NamedDecl *D) {
- if (auto *RD = dyn_cast<CXXRecordDecl>(D))
- return RD->isThisDeclarationADefinition();
- if (auto *ED = dyn_cast<EnumDecl>(D))
- return ED->isThisDeclarationADefinition();
- if (auto *FD = dyn_cast<FunctionDecl>(D))
- return FD->isThisDeclarationADefinition();
- if (auto *VD = dyn_cast<VarDecl>(D))
- return VD->isThisDeclarationADefinition() == VarDecl::Definition;
- llvm_unreachable("unexpected decl type");
- };
- auto FoundAcceptableDefinition = [&](NamedDecl *D) {
- if (!isa<CXXRecordDecl, FunctionDecl, EnumDecl, VarDecl>(D))
- return DefinitionIsAcceptable(D);
-
- for (auto *RD : D->redecls()) {
- auto *ND = cast<NamedDecl>(RD);
- if (!IsDefinition(ND))
- continue;
- if (DefinitionIsAcceptable(ND)) {
- *Suggested = ND;
- return true;
- }
- }
-
- return false;
- };
-
- if (auto *RD = dyn_cast<CXXRecordDecl>(D)) {
+ } else if (auto *RD = dyn_cast<CXXRecordDecl>(D)) {
if (auto *Pattern = RD->getTemplateInstantiationPattern())
RD = Pattern;
D = RD->getDefinition();
@@ -9400,14 +9352,34 @@ bool Sema::hasAcceptableDefinition(NamedDecl *D, NamedDecl **Suggested,
*Suggested = D;
- if (FoundAcceptableDefinition(D))
+ auto DefinitionIsAcceptable = [&] {
+ // The (primary) definition might be in a visible module.
+ if (isAcceptable(D, Kind))
+ return true;
+
+ // A visible module might have a merged definition instead.
+ if (D->isModulePrivate() ? hasMergedDefinitionInCurrentModule(D)
+ : hasVisibleMergedDefinition(D)) {
+ if (CodeSynthesisContexts.empty() &&
+ !getLangOpts().ModulesLocalVisibility) {
+ // Cache the fact that this definition is implicitly visible because
+ // there is a visible merged definition.
+ D->setVisibleDespiteOwningModule();
+ }
+ return true;
+ }
+
+ return false;
+ };
+
+ if (DefinitionIsAcceptable())
return true;
// The external source may have additional definitions of this entity that are
// visible, so complete the redeclaration chain now and ask again.
if (auto *Source = Context.getExternalSource()) {
Source->CompleteRedeclChain(D);
- return FoundAcceptableDefinition(D);
+ return DefinitionIsAcceptable();
}
return false;
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index f0fb247f1afb9..f8e9caa3f5d1d 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -3642,9 +3642,23 @@ template<>
void ASTDeclReader::attachPreviousDeclImpl(ASTReader &Reader,
Redeclarable<VarDecl> *D,
Decl *Previous, Decl *Canon) {
+ auto *VD = static_cast<VarDecl *>(D);
auto *PrevVD = cast<VarDecl>(Previous);
D->RedeclLink.setPrevious(PrevVD);
D->First = PrevVD->First;
+
+ // We should keep at most one definition on the chain.
+ // FIXME: Cache the definition once we've found it. Building a chain with
+ // N definitions currently takes O(N^2) time here.
+ if (VD->isThisDeclarationADefinition() == VarDecl::Definition) {
+ for (VarDecl *CurD = PrevVD; CurD; CurD = CurD->getPreviousDecl()) {
+ if (CurD->isThisDeclarationADefinition() == VarDecl::Definition) {
+ Reader.mergeDefinitionVisibility(CurD, VD);
+ VD->demoteThisDefinitionToDeclaration();
+ break;
+ }
+ }
+ }
}
static bool isUndeducedReturnType(QualType T) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp b/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
index e682c4ef80896..f226f80aa441f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
@@ -70,16 +70,12 @@ enum FoundationClass {
static FoundationClass findKnownClass(const ObjCInterfaceDecl *ID,
bool IncludeSuperclasses = true) {
- static llvm::StringMap<FoundationClass> Classes;
- if (Classes.empty()) {
- Classes["NSArray"] = FC_NSArray;
- Classes["NSDictionary"] = FC_NSDictionary;
- Classes["NSEnumerator"] = FC_NSEnumerator;
- Classes["NSNull"] = FC_NSNull;
- Classes["NSOrderedSet"] = FC_NSOrderedSet;
- Classes["NSSet"] = FC_NSSet;
- Classes["NSString"] = FC_NSString;
- }
+ static const llvm::StringMap<FoundationClass> Classes{
+ {"NSArray", FC_NSArray}, {"NSDictionary", FC_NSDictionary},
+ {"NSEnumerator", FC_NSEnumerator}, {"NSNull", FC_NSNull},
+ {"NSOrderedSet", FC_NSOrderedSet}, {"NSSet", FC_NSSet},
+ {"NSString", FC_NSString},
+ };
// FIXME: Should we cache this at all?
FoundationClass result = Classes.lookup(ID->getIdentifier()->getName());
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoDeleteChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoDeleteChecker.cpp
index 2740890704767..abbdc2967e859 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoDeleteChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoDeleteChecker.cpp
@@ -92,7 +92,11 @@ class NoDeleteChecker : public Checker<check::ASTDecl<TranslationUnitDecl>> {
return;
auto Body = FD->getBody();
- if (!Body || TFA.isTrivial(Body))
+ if (!Body)
+ return;
+
+ auto hasTrivialDtor = [&](VarDecl *D) { return TFA.hasTrivialDtor(D); };
+ if (llvm::all_of(FD->parameters(), hasTrivialDtor) && TFA.isTrivial(Body))
return;
SmallString<100> Buf;
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index c47dabf2ec5b0..8cd64c12b7a73 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -516,19 +516,57 @@ class TrivialFunctionAnalysisVisitor
return Result;
}
+ bool CanTriviallyDestruct(QualType Ty) {
+ assert(!Ty.isNull());
+
+ // T*, T& or T&& does not run its destructor.
+ if (Ty->isPointerOrReferenceType())
+ return true;
+
+ // Primitive types don't have destructors.
+ if (Ty->isIntegralOrEnumerationType())
+ return true;
+
+ if (const auto *R = Ty->getAsCXXRecordDecl()) {
+ // C++ trivially destructible classes are fine.
+ if (R->hasTrivialDestructor())
+ return true;
+
+ // For Webkit, side-effects are fine as long as we don't delete objects,
+ // so check recursively.
+ if (const auto *Dtor = R->getDestructor())
+ return IsFunctionTrivial(Dtor);
+ }
+
+ // Structs in C are trivial.
+ if (Ty->isRecordType())
+ return true;
+
+ // For arrays it depends on the element type.
+ // FIXME: We should really use ASTContext::getAsArrayType instead.
+ if (const auto *AT = Ty->getAsArrayTypeUnsafe())
+ return CanTriviallyDestruct(AT->getElementType());
+
+ return false; // Otherwise it's likely not trivial.
+ }
+
public:
using CacheTy = TrivialFunctionAnalysis::CacheTy;
TrivialFunctionAnalysisVisitor(CacheTy &Cache) : Cache(Cache) {}
bool IsFunctionTrivial(const Decl *D) {
- if (auto *FnDecl = dyn_cast<FunctionDecl>(D)) {
- if (isNoDeleteFunction(FnDecl))
- return true;
- if (FnDecl->isVirtualAsWritten())
- return false;
- }
return WithCachedResult(D, [&]() {
+ if (auto *FnDecl = dyn_cast<FunctionDecl>(D)) {
+ if (isNoDeleteFunction(FnDecl))
+ return true;
+ if (auto *MD = dyn_cast<CXXMethodDecl>(D); MD && MD->isVirtual())
+ return false;
+ for (auto *Param : FnDecl->parameters()) {
+ if (!HasTrivialDestructor(Param))
+ return false;
+ }
+ }
if (auto *CtorDecl = dyn_cast<CXXConstructorDecl>(D)) {
for (auto *CtorInit : CtorDecl->inits()) {
if (!Visit(CtorInit->getInit()))
@@ -542,6 +580,11 @@ class TrivialFunctionAnalysisVisitor
});
}
+ bool HasTrivialDestructor(const VarDecl *VD) {
+ return WithCachedResult(
+ VD, [&] { return CanTriviallyDestruct(VD->getType()); });
+ }
+
bool IsStatementTrivial(const Stmt *S) {
auto CacheIt = Cache.find(S);
if (CacheIt != Cache.end())
@@ -579,7 +622,16 @@ class TrivialFunctionAnalysisVisitor
return true;
}
- bool VisitDeclStmt(const DeclStmt *DS) { return VisitChildren(DS); }
+ bool VisitDeclStmt(const DeclStmt *DS) {
+ for (auto &Decl : DS->decls()) {
+ // FIXME: Handle DecompositionDecls.
+ if (auto *VD = dyn_cast<VarDecl>(Decl)) {
+ if (!HasTrivialDestructor(VD))
+ return false;
+ }
+ }
+ return VisitChildren(DS);
+ }
bool VisitDoStmt(const DoStmt *DS) { return VisitChildren(DS); }
bool VisitIfStmt(const IfStmt *IS) {
return WithCachedResult(IS, [&]() { return VisitChildren(IS); });
@@ -731,6 +783,10 @@ class TrivialFunctionAnalysisVisitor
return true;
}
+ bool VisitCXXDefaultInitExpr(const CXXDefaultInitExpr *E) {
+ return Visit(E->getExpr());
+ }
+
bool checkArguments(const CallExpr *CE) {
for (const Expr *Arg : CE->arguments()) {
if (Arg && !Visit(Arg))
@@ -749,6 +805,10 @@ class TrivialFunctionAnalysisVisitor
return IsFunctionTrivial(CE->getConstructor());
}
+ bool VisitCXXDeleteExpr(const CXXDeleteExpr *DE) {
+ return CanTriviallyDestruct(DE->getDestroyedType());
+ }
+
bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E) {
return IsFunctionTrivial(E->getConstructor());
}
@@ -769,7 +829,7 @@ class TrivialFunctionAnalysisVisitor
bool VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *BTE) {
if (auto *Temp = BTE->getTemporary()) {
- if (!TrivialFunctionAnalysis::isTrivialImpl(Temp->getDestructor(), Cache))
+ if (!IsFunctionTrivial(Temp->getDestructor()))
return false;
}
return Visit(BTE->getSubExpr());
@@ -857,4 +917,10 @@ bool TrivialFunctionAnalysis::isTrivialImpl(
return V.IsStatementTrivial(S);
}
+bool TrivialFunctionAnalysis::hasTrivialDtorImpl(const VarDecl *VD,
+ CacheTy &Cache) {
+ TrivialFunctionAnalysisVisitor V(Cache);
+ return V.HasTrivialDestructor(VD);
+}
+
} // namespace clang
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index 431357a2150be..8a696a789c65b 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -28,6 +28,7 @@ class Stmt;
class TranslationUnitDecl;
class Type;
class TypedefDecl;
+class VarDecl;
// Ref-countability of a type is implicitly defined by Ref<T> and RefPtr<T>
// implementation. It can be modeled as: type T having public methods ref() and
@@ -169,6 +170,9 @@ class TrivialFunctionAnalysis {
/// \returns true if \p D is a "trivial" function.
bool isTrivial(const Decl *D) const { return isTrivialImpl(D, TheCache); }
bool isTrivial(const Stmt *S) const { return isTrivialImpl(S, TheCache); }
+ bool hasTrivialDtor(const VarDecl *VD) const {
+ return hasTrivialDtorImpl(VD, TheCache);
+ }
private:
friend class TrivialFunctionAnalysisVisitor;
@@ -179,6 +183,7 @@ class TrivialFunctionAnalysis {
static bool isTrivialImpl(const Decl *D, CacheTy &Cache);
static bool isTrivialImpl(const Stmt *S, CacheTy &Cache);
+ static bool hasTrivialDtorImpl(const VarDecl *VD, CacheTy &Cache);
};
} // namespace clang
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index 8a24a3c64e0e4..c139a5cb13de7 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -227,12 +227,20 @@ template <typename T> bool operator!=(const RefPtr<T> &, T &) { return false; }
struct RefCountable {
static Ref<RefCountable> create();
static std::unique_ptr<RefCountable> makeUnique();
- void ref() {}
- void deref() {}
+ void ref() { ++m_refCount; }
+ void deref() {
+ --m_refCount;
+ if (!--m_refCount)
+ delete this;
+ }
+ ~RefCountable();
void method();
void constMethod() const;
int trivial() { return 123; }
RefCountable* next();
+
+private:
+ unsigned m_refCount { 0 };
};
template <typename T> T *downcast(T *t) { return t; }
@@ -280,11 +288,14 @@ template <typename T> struct CheckedPtr {
class CheckedObj {
public:
- void incrementCheckedPtrCount();
- void decrementCheckedPtrCount();
+ void incrementCheckedPtrCount() { ++m_ptrCount; }
+ void decrementCheckedPtrCount() { --m_ptrCount; }
void method();
int trivial() { return 123; }
CheckedObj* next();
+
+private:
+ unsigned m_ptrCount { 0 };
};
class RefCountableAndCheckable {
@@ -348,8 +359,8 @@ class WeakPtrImpl {
private:
template <typename T>
- WeakPtrImpl(T* t)
- : ptr(static_cast<void*>(t))
+ WeakPtrImpl(T& t)
+ : ptr(static_cast<void*>(&t))
{ }
};
@@ -361,9 +372,9 @@ class CanMakeWeakPtr {
template <typename U> friend class CanMakeWeakPtr;
template <typename U> friend class WeakPtr;
- Ref<WeakPtrImpl> createWeakPtrImpl() {
+ WeakPtrImpl& createWeakPtrImpl() {
if (!impl)
- impl = WeakPtrImpl::create(static_cast<T>(*this));
+ impl = WeakPtrImpl::create(static_cast<T&>(*this));
return *impl;
}
@@ -382,21 +393,26 @@ class WeakPtr {
RefPtr<WeakPtrImpl> impl;
public:
- WeakPtr(T& t) {
- *this = t;
+ WeakPtr(T& t)
+ : impl(t.createWeakPtrImpl()) {
}
- WeakPtr(T* t) {
- *this = t;
+ WeakPtr(T* t)
+ : impl(t ? &t->createWeakPtrImpl() : nullptr) {
}
template <typename U>
WeakPtr<T> operator=(U& obj) {
impl = obj.createWeakPtrImpl();
+ return *this;
}
template <typename U>
WeakPtr<T> operator=(U* obj) {
- impl = obj ? obj->createWeakPtrImpl() : nullptr;
+ if (obj)
+ impl = obj->createWeakPtrImpl();
+ else
+ impl = nullptr;
+ return *this;
}
T* get() {
diff --git a/clang/test/Analysis/Checkers/WebKit/nodelete-annotation.cpp b/clang/test/Analysis/Checkers/WebKit/nodelete-annotation.cpp
index 82667a7916f42..98f4017e5e3fd 100644
--- a/clang/test/Analysis/Checkers/WebKit/nodelete-annotation.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/nodelete-annotation.cpp
@@ -1,5 +1,7 @@
// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoDeleteChecker -verify %s
+#include "mock-types.h"
+
void someFunction();
void [[clang::annotate_type("webkit.nodelete")]] safeFunction();
@@ -28,7 +30,28 @@ void [[clang::annotate_type("webkit.nodelete")]] defWithNoDelete() {
someFunction();
}
+class WeakRefCountable : public CanMakeWeakPtr<WeakRefCountable> {
+public:
+ static Ref<WeakRefCountable> create();
+
+ ~WeakRefCountable();
+
+ void ref() { m_refCount++; }
+ void deref() {
+ m_refCount--;
+ if (!m_refCount)
+ delete this;
+ }
+
+private:
+ WeakRefCountable();
+
+ unsigned m_refCount { 0 };
+};
+
class SomeClass {
+public:
+
void [[clang::annotate_type("webkit.nodelete")]] someMethod();
void [[clang::annotate_type("webkit.nodelete")]] unsafeMethod() {
// expected-warning@-1{{A function 'unsafeMethod' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
@@ -57,6 +80,59 @@ class SomeClass {
}
virtual void [[clang::annotate_type("webkit.nodelete")]] anotherVirtualMethod();
+
+ void [[clang::annotate_type("webkit.nodelete")]] setObj(RefCountable* obj) {
+ // expected-warning@-1{{A function 'setObj' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
+ m_obj = obj;
+ }
+
+ void [[clang::annotate_type("webkit.nodelete")]] swapObj(RefPtr<RefCountable>&& obj) {
+ m_obj.swap(obj);
+ }
+
+ void [[clang::annotate_type("webkit.nodelete")]] clearObj(RefCountable* obj) {
+ // expected-warning@-1{{A function 'clearObj' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
+ m_obj = nullptr;
+ }
+
+ void [[clang::annotate_type("webkit.nodelete")]] deposeArg(WeakRefCountable&& unused) {
+ }
+
+ void [[clang::annotate_type("webkit.nodelete")]] deposeArgPtr(RefPtr<RefCountable>&& unused) {
+ }
+
+ enum class E : unsigned char { V1, V2 };
+ bool [[clang::annotate_type("webkit.nodelete")]] deposeArgEnum() {
+ E&& e = E::V1;
+ return e != E::V2;
+ }
+
+ void [[clang::annotate_type("webkit.nodelete")]] deposeLocal() {
+ // expected-warning@-1{{A function 'deposeLocal' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
+ RefPtr<RefCountable> obj = std::move(m_obj);
+ }
+
+ RefPtr<RefCountable> [[clang::annotate_type("webkit.nodelete")]] copyRefPtr() {
+ return m_obj;
+ }
+
+ Ref<WeakRefCountable> [[clang::annotate_type("webkit.nodelete")]] copyRef() {
+ return *m_weakObj.get();
+ }
+
+ RefPtr<WeakRefCountable> [[clang::annotate_type("webkit.nodelete")]] getWeakPtr() {
+ return m_weakObj.get();
+ }
+
+ WeakRefCountable* [[clang::annotate_type("webkit.nodelete")]] useWeakPtr() {
+ WeakPtr localWeak = m_weakObj.get();
+ return localWeak.get();
+ }
+
+private:
+ RefPtr<RefCountable> m_obj;
+ Ref<RefCountable> m_ref;
+ WeakPtr<WeakRefCountable> m_weakObj;
};
class IntermediateClass : public SomeClass {
@@ -81,3 +157,50 @@ class Derived : public Base<Type> {
public:
virtual unsigned foo() const { return 0; }
};
+
+struct Data {
+ static Ref<Data> create() {
+ return adoptRef(*new Data);
+ }
+
+ void ref() {
+ ++refCount;
+ }
+
+ void deref() {
+ --refCount;
+ if (!refCount)
+ delete this;
+ }
+
+ virtual void doSomething() { }
+
+ int a[3] { 0 };
+
+protected:
+ Data() = default;
+
+private:
+ unsigned refCount { 0 };
+};
+
+struct SubData : Data {
+ static Ref<SubData> create() {
+ return adoptRef(*new SubData);
+ }
+
+ void doSomething() override { }
+
+private:
+ SubData() = default;
+};
+
+void [[clang::annotate_type("webkit.nodelete")]] makeData() {
+ RefPtr<Data> constantData[2] = { Data::create() };
+ RefPtr<Data> data[] = { Data::create() };
+}
+
+void [[clang::annotate_type("webkit.nodelete")]] makeSubData() {
+ // expected-warning@-1{{A function 'makeSubData' has [[clang::annotate_type("webkit.nodelete")]] but it contains code that could destruct an object}}
+ SubData::create()->doSomething();
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
index e8022b7fe8ba0..ad90198d5ac8b 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
@@ -26,6 +26,7 @@ void foo_ref() {
void foo_ref_trivial() {
RefCountable automatic;
RefCountable &bar = automatic;
+ // expected-warning@-1{{Local variable 'bar' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
}
void bar_ref(RefCountable &) {}
@@ -63,7 +64,12 @@ void foo4() {
void foo5() {
RefPtr<RefCountable> foo;
auto* bar = foo.get();
+ // expected-warning@-1{{Local variable 'bar' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
bar->trivial();
+ {
+ auto* baz = foo.get();
+ baz->trivial();
+ }
}
void foo6() {
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index bfe418b112a9d..c1ed882069073 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -15,7 +15,6 @@
// CHECK-NEXT: core.CallAndMessage
// CHECK-NEXT: core.DivideZero
// CHECK-NEXT: core.DynamicTypePropagation
-// CHECK-NEXT: core.FixedAddressDereference
// CHECK-NEXT: core.NonNullParamChecker
// CHECK-NEXT: core.NonnilStringConstants
// CHECK-NEXT: core.NullDereference
diff --git a/clang/test/Analysis/builtin_bitcast.cpp b/clang/test/Analysis/builtin_bitcast.cpp
index 2ba32ec6d23d2..bcaec9ecc3096 100644
--- a/clang/test/Analysis/builtin_bitcast.cpp
+++ b/clang/test/Analysis/builtin_bitcast.cpp
@@ -1,5 +1,5 @@
// RUN: %clang_analyze_cc1 -triple x86_64-unknown-unknown -verify %s \
-// RUN: -analyzer-checker=core,debug.ExprInspection -analyzer-disable-checker=core.FixedAddressDereference
+// RUN: -analyzer-checker=core,debug.ExprInspection
template <typename T> void clang_analyzer_dump(T);
using size_t = decltype(sizeof(int));
diff --git a/clang/test/Analysis/concrete-address.c b/clang/test/Analysis/concrete-address.c
index 683b7f29f4611..0822c8a0b7532 100644
--- a/clang/test/Analysis/concrete-address.c
+++ b/clang/test/Analysis/concrete-address.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.core.FixedAddr,optin.core.FixedAddressDereference -verify %s
extern void __assert_fail (__const char *__assertion, __const char *__file,
unsigned int __line, __const char *__function)
diff --git a/clang/test/Analysis/dtor.cpp b/clang/test/Analysis/dtor.cpp
index 9e00e937a7c29..ab46ff5ec5ecf 100644
--- a/clang/test/Analysis/dtor.cpp
+++ b/clang/test/Analysis/dtor.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,debug.ExprInspection,cplusplus -analyzer-disable-checker=core.FixedAddressDereference -analyzer-config c++-inlining=destructors -Wno-null-dereference -Wno-inaccessible-base -verify -analyzer-config eagerly-assume=false %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,debug.ExprInspection,cplusplus -analyzer-config c++-inlining=destructors -Wno-null-dereference -Wno-inaccessible-base -verify -analyzer-config eagerly-assume=false %s
void clang_analyzer_eval(bool);
void clang_analyzer_checkInlined(bool);
diff --git a/clang/test/Analysis/fixed-address-notes.c b/clang/test/Analysis/fixed-address-notes.c
index e246ee5a464b0..537fa8cbb6463 100644
--- a/clang/test/Analysis/fixed-address-notes.c
+++ b/clang/test/Analysis/fixed-address-notes.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core -analyzer-output=text -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.core.FixedAddressDereference -analyzer-output=text -verify %s
extern char *something();
diff --git a/clang/test/Analysis/misc-ps.m b/clang/test/Analysis/misc-ps.m
index 794d8bbceb459..c22e0dbb6137d 100644
--- a/clang/test/Analysis/misc-ps.m
+++ b/clang/test/Analysis/misc-ps.m
@@ -1,6 +1,6 @@
// NOTE: Use '-fobjc-gc' to test the analysis being run twice, and multiple reports are not issued.
-// RUN: %clang_analyze_cc1 -triple i386-apple-darwin10 -analyzer-checker=core,alpha.core,osx.cocoa.AtSync -analyzer-disable-checker=core.FixedAddressDereference -Wno-strict-prototypes -Wno-pointer-to-int-cast -verify -fblocks -Wno-unreachable-code -Wno-null-dereference -Wno-objc-root-class %s
-// RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -analyzer-checker=core,alpha.core,osx.cocoa.AtSync -analyzer-disable-checker=core.FixedAddressDereference -Wno-strict-prototypes -Wno-pointer-to-int-cast -verify -fblocks -Wno-unreachable-code -Wno-null-dereference -Wno-objc-root-class %s
+// RUN: %clang_analyze_cc1 -triple i386-apple-darwin10 -analyzer-checker=core,alpha.core,osx.cocoa.AtSync -Wno-strict-prototypes -Wno-pointer-to-int-cast -verify -fblocks -Wno-unreachable-code -Wno-null-dereference -Wno-objc-root-class %s
+// RUN: %clang_analyze_cc1 -triple x86_64-apple-darwin10 -analyzer-checker=core,alpha.core,osx.cocoa.AtSync -Wno-strict-prototypes -Wno-pointer-to-int-cast -verify -fblocks -Wno-unreachable-code -Wno-null-dereference -Wno-objc-root-class %s
#ifndef __clang_analyzer__
#error __clang_analyzer__ not defined
diff --git a/clang/test/Analysis/pr22954.c b/clang/test/Analysis/pr22954.c
index b5f8aeb2a5ca6..3d1cac1972066 100644
--- a/clang/test/Analysis/pr22954.c
+++ b/clang/test/Analysis/pr22954.c
@@ -3,7 +3,7 @@
// At the moment the whole of the destination array content is invalidated.
// If a.s1 region has a symbolic offset, the whole region of 'a' is invalidated.
// Specific triple set to test structures of size 0.
-// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,unix.Malloc,debug.ExprInspection -analyzer-disable-checker=core.FixedAddressDereference -Wno-error=int-conversion -verify -analyzer-config eagerly-assume=false %s
+// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,unix.Malloc,debug.ExprInspection -Wno-error=int-conversion -verify -analyzer-config eagerly-assume=false %s
typedef __typeof(sizeof(int)) size_t;
diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
index 9b3296064981f..4de004e00687a 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
@@ -23,7 +23,6 @@
// CHECK-NEXT: core.CallAndMessage
// CHECK-NEXT: core.DivideZero
// CHECK-NEXT: core.DynamicTypePropagation
-// CHECK-NEXT: core.FixedAddressDereference
// CHECK-NEXT: core.NonNullParamChecker
// CHECK-NEXT: core.NonnilStringConstants
// CHECK-NEXT: core.NullDereference
diff --git a/clang/test/Analysis/suppress-dereferences-from-any-address-space.c b/clang/test/Analysis/suppress-dereferences-from-any-address-space.c
index 5b42262c87223..c14781876c4ef 100644
--- a/clang/test/Analysis/suppress-dereferences-from-any-address-space.c
+++ b/clang/test/Analysis/suppress-dereferences-from-any-address-space.c
@@ -1,7 +1,7 @@
-// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,alpha.core -std=gnu99 -analyzer-config suppress-dereferences-from-any-address-space=false -verify=x86-nosuppress,common %s
-// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,alpha.core -std=gnu99 -verify=x86-suppress,common %s
-// RUN: %clang_analyze_cc1 -triple arm-pc-linux-gnu -analyzer-checker=core,alpha.core -std=gnu99 -analyzer-config suppress-dereferences-from-any-address-space=false -verify=other-nosuppress,common %s
-// RUN: %clang_analyze_cc1 -triple arm-pc-linux-gnu -analyzer-checker=core,alpha.core -std=gnu99 -verify=other-suppress,common %s
+// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,optin.core.FixedAddressDereference -std=gnu99 -analyzer-config suppress-dereferences-from-any-address-space=false -verify=x86-nosuppress,common %s
+// RUN: %clang_analyze_cc1 -triple x86_64-pc-linux-gnu -analyzer-checker=core,optin.core.FixedAddressDereference -std=gnu99 -verify=x86-suppress,common %s
+// RUN: %clang_analyze_cc1 -triple arm-pc-linux-gnu -analyzer-checker=core,optin.core.FixedAddressDereference -std=gnu99 -analyzer-config suppress-dereferences-from-any-address-space=false -verify=other-nosuppress,common %s
+// RUN: %clang_analyze_cc1 -triple arm-pc-linux-gnu -analyzer-checker=core,optin.core.FixedAddressDereference -std=gnu99 -verify=other-suppress,common %s
// Address-space attributes suppress the report even if the pointees are not marked `volatile`.
#define AS_ATTRIBUTE(_X) __attribute__((address_space(_X)))
diff --git a/clang/test/CIR/CodeGenBuiltins/builtin-structured-binding-size.cpp b/clang/test/CIR/CodeGenBuiltins/builtin-structured-binding-size.cpp
new file mode 100644
index 0000000000000..cd391bfe14f4b
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/builtin-structured-binding-size.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+struct S {
+ int a;
+ double b;
+ char c;
+};
+
+int test_structured_binding_size() {
+ return __builtin_structured_binding_size(S);
+}
+
+// CIR: cir.func {{.*}} @_Z28test_structured_binding_sizev()
+// CIR: %[[SIZE:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: cir.store %[[SIZE:.*]], %[[RETVAL:.*]]
+// CIR: %[[RET:.*]] = cir.load %[[RETVAL:.*]]
+// CIR: cir.return %[[RET:.*]] : !s32i
+
+// LLVM: define{{.*}} i32 @_Z28test_structured_binding_sizev()
+// LLVM: store i32 3, ptr %[[RETVAL:.*]]
+// LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM: ret i32 %[[RET:.*]]
+
+// OGCG: define{{.*}} i32 @_Z28test_structured_binding_sizev()
+// OGCG: ret i32 3
diff --git a/clang/test/CIR/CodeGenBuiltins/builtin-trivally-copyable.cpp b/clang/test/CIR/CodeGenBuiltins/builtin-trivally-copyable.cpp
new file mode 100644
index 0000000000000..e8e0fbf3eeedd
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/builtin-trivally-copyable.cpp
@@ -0,0 +1,57 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+bool g;
+void store_trivially_copyable_result() {
+ g = __is_trivially_copyable(int);
+}
+
+// CIR: cir.func {{.*}} @_Z31store_trivially_copyable_resultv()
+// CIR: %[[TRUE:.*]] = cir.const #true
+// CIR: %[[G_PTR:.*]] = cir.get_global @g : !cir.ptr<!cir.bool>
+// CIR: cir.store{{.*}} %[[TRUE]], %[[G_PTR:.*]] : !cir.bool, !cir.ptr<!cir.bool>
+
+// LLVM: define{{.*}} void @_Z31store_trivially_copyable_resultv()
+// LLVM: store i8 1, ptr @g
+
+// OGCG: define{{.*}} void @_Z31store_trivially_copyable_resultv()
+// OGCG: store i8 1, ptr @g
+
+int test_trivially_copyable_as_bool() {
+ if (!__is_trivially_copyable(int))
+ return -1;
+ return 0;
+}
+
+// CIR: cir.func {{.*}} @_Z31test_trivially_copyable_as_boolv()
+// CIR: %[[FALSE:.*]] = cir.const #false
+// CIR: cir.if %[[FALSE]] {
+// CIR: %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s32i
+// CIR: cir.store %[[NEG_ONE]], %[[RETVAL:.*]]
+// CIR: %[[RET:.*]] = cir.load %[[RETVAL:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.return %[[RET:.*]] : !s32i
+// CIR: }
+// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR: cir.store %[[ZERO]], %[[RETVAL:.*]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[RET:.*]] = cir.load %[[RETVAL:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.return %[[RET:.*]] : !s32i
+
+// LLVM: define{{.*}} i32 @_Z31test_trivially_copyable_as_boolv()
+// LLVM: br i1 false, label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+// LLVM: [[IF_THEN]]:
+// LLVM: store i32 -1, ptr %[[RETVAL:.*]]
+// LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM: ret i32 %[[RET:.*]]
+// LLVM: [[IF_ELSE]]:
+// LLVM: br label %[[IF_END:.*]]
+// LLVM: [[IF_END]]:
+// LLVM: store i32 0, ptr %[[RETVAL:.*]]
+// LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM: ret i32 %[[RET:.*]]
+
+// OGCG: define{{.*}} i32 @_Z31test_trivially_copyable_as_boolv()
+// OGCG: ret i32 0
diff --git a/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c b/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
new file mode 100644
index 0000000000000..dcf5fd4246481
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+int g;
+void store_types_compatible_result() {
+ g = __builtin_types_compatible_p(int, const int);
+}
+
+// CIR: cir.func {{.*}} @store_types_compatible_result()
+// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[G_PTR:.*]] = cir.get_global @g : !cir.ptr<!s32i>
+// CIR: cir.store{{.*}} %[[ONE]], %[[G_PTR:.*]] : !s32i, !cir.ptr<!s32i>
+
+// LLVM: define{{.*}} void @store_types_compatible_result()
+// LLVM: store i32 1, ptr @g
+
+// OGCG: define{{.*}} void @store_types_compatible_result()
+// OGCG: store i32 1, ptr @g
+
+int test_convert_bool_to_int() {
+ if (!__builtin_types_compatible_p(int, const int))
+ return -1;
+ return 0;
+}
+
+// CIR: cir.func {{.*}} @test_convert_bool_to_int()
+// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[BOOL:.*]] = cir.cast int_to_bool %[[ONE]] : !s32i -> !cir.bool
+// CIR: %[[NOT:.*]] = cir.unary(not, %[[BOOL]]) : !cir.bool, !cir.bool
+// CIR: cir.if %[[NOT]] {
+// CIR: %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s32i
+// CIR: cir.store %[[NEG_ONE]], %[[RETVAL:.*]]
+// CIR: %[[RET:.*]] = cir.load %[[RETVAL:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.return %[[RET:.*]] : !s32i
+// CIR: }
+// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR: cir.store %[[ZERO]], %[[RETVAL:.*]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[RET:.*]] = cir.load %[[RETVAL:.*]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.return %[[RET:.*]] : !s32i
+
+// LLVM: define{{.*}} i32 @test_convert_bool_to_int()
+// LLVM: br i1 false, label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+// LLVM: [[IF_THEN]]:
+// LLVM: store i32 -1, ptr %[[RETVAL:.*]]
+// LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM: ret i32 %[[RET:.*]]
+// LLVM: [[IF_ELSE]]:
+// LLVM: br label %[[IF_END:.*]]
+// LLVM: [[IF_END]]:
+// LLVM: store i32 0, ptr %[[RETVAL:.*]]
+// LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL:.*]]
+// LLVM: ret i32 %[[RET:.*]]
+
+// OGCG: define{{.*}} i32 @test_convert_bool_to_int()
+// OGCG: ret i32 0
diff --git a/clang/test/CodeGen/AArch64/neon/fullfp16.c b/clang/test/CodeGen/AArch64/neon/fullfp16.c
index f3268df2f4165..ab424fc08f176 100644
--- a/clang/test/CodeGen/AArch64/neon/fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fullfp16.c
@@ -50,3 +50,25 @@ float16_t test_vnegh_f16(float16_t a) {
// LLVM: ret half [[NEG]]
return vnegh_f16(a);
}
+
+// ALL-LABEL: test_vfmah_f16
+float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" {{.*}} : (!cir.f16, !cir.f16, !cir.f16) -> !cir.f16
+
+// LLVM-SAME: half{{.*}} [[A:%.*]], half{{.*}} [[B:%.*]], half{{.*}} [[C:%.*]])
+// LLVM: [[FMA:%.*]] = call half @llvm.fma.f16(half [[B]], half [[C]], half [[A]])
+// LLVM: ret half [[FMA]]
+ return vfmah_f16(a, b, c);
+}
+
+// ALL-LABEL: test_vfmsh_f16
+float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) {
+// CIR: [[SUB:%.*]] = cir.unary(minus, %{{.*}}) : !cir.f16, !cir.f16
+// CIR: cir.call_llvm_intrinsic "fma" [[SUB]], {{.*}} : (!cir.f16, !cir.f16, !cir.f16) -> !cir.f16
+
+// LLVM-SAME: half{{.*}} [[A:%.*]], half{{.*}} [[B:%.*]], half{{.*}} [[C:%.*]])
+// LLVM: [[SUB:%.*]] = fneg half [[B]]
+// LLVM: [[ADD:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[C]], half [[A]])
+// LLVM: ret half [[ADD]]
+ return vfmsh_f16(a, b, c);
+}
diff --git a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
index 353f02195721f..080e2351ff1e7 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
@@ -619,19 +619,3 @@ float16_t test_vrsqrtsh_f16(float16_t a, float16_t b) {
float16_t test_vsubh_f16(float16_t a, float16_t b) {
return vsubh_f16(a, b);
}
-
-// CHECK-LABEL: test_vfmah_f16
-// CHECK: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half %c, half %a)
-// CHECK: ret half [[FMA]]
-float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) {
- return vfmah_f16(a, b, c);
-}
-
-// CHECK-LABEL: test_vfmsh_f16
-// CHECK: [[SUB:%.*]] = fneg half %b
-// CHECK: [[ADD:%.*]] = call half @llvm.fma.f16(half [[SUB]], half %c, half %a)
-// CHECK: ret half [[ADD]]
-float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) {
- return vfmsh_f16(a, b, c);
-}
-
diff --git a/clang/test/CodeGen/attr-no-outline.c b/clang/test/CodeGen/attr-no-outline.c
index 60d2ab5563f34..3e82ca338a121 100644
--- a/clang/test/CodeGen/attr-no-outline.c
+++ b/clang/test/CodeGen/attr-no-outline.c
@@ -1,16 +1,46 @@
-// RUN: %clang_cc1 -emit-llvm -x c %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks | FileCheck %s --check-prefix=C
-// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks | FileCheck %s --check-prefix=CXX
-// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -std=c++23 | FileCheck %s --check-prefixes=CXX,CXX23
+// RUN: %clang_cc1 -emit-llvm -x c %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -DTEST_ATTR | FileCheck %s --check-prefix=C,C-ATTR
+// RUN: %clang_cc1 -emit-llvm -x c %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -mno-outline | FileCheck %s --check-prefix=C,C-ARG
+// RUN: %clang_cc1 -emit-llvm -x c %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks | FileCheck %s --check-prefix=C,C-NONE
+
+
+// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -std=c++23 -DTEST_ATTR | FileCheck %s --check-prefixes=CXX,CXX-ATTR
+// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -std=c++23 -mno-outline | FileCheck %s --check-prefixes=CXX,CXX-ARG
+// RUN: %clang_cc1 -emit-llvm -x c++ %s -triple x86_64-unknown-linux-gnu -o - -femit-all-decls -fblocks -std=c++23 | FileCheck %s --check-prefixes=CXX,CXX-NONE
+
+// This test checks that:
+// - [[clang::no_outline]] adds the nooutline IR attribute to specific definitions
+// - `-mno-outline` adds the nooutline IR attribute to all definitions
+// - Lack of either does not add nooutline IR attribute
+
+#ifdef TEST_ATTR
+#define ATTR [[clang::no_outline]]
+#define ATTR_DUNDER __attribute__((no_outline))
+#else
+#define ATTR
+#define ATTR_DUNDER
+#endif
// C-LABEL: define dso_local i32 @toplevel_func(
-// C-SAME: ) #[[ATTR0:[0-9]+]] {
+// C-SAME: ) #[[ATTR1:[0-9]+]] {
// CXX-LABEL: define dso_local noundef i32 @_Z13toplevel_funci(
-// CXX-SAME: ) #[[ATTR0:[0-9]+]] {
-[[clang::no_outline]] int toplevel_func(int x) {
+// CXX-SAME: ) #[[ATTR1:[0-9]+]] {
+ATTR int toplevel_func(int x) {
return x;
}
+// C-LABEL: define dso_local i32 @toplevel_func_noattr(
+// C-ATTR-SAME: ) #[[ATTR2:[0-9]+]] {
+// C-ARG-SAME: ) #[[ATTR1]] {
+// C-NONE-SAME: ) #[[ATTR1]] {
+
+// CXX-LABEL: define dso_local noundef i32 @_Z20toplevel_func_noattri(
+// CXX-ATTR-SAME: ) #[[ATTR2:[0-9]+]] {
+// CXX-ARG-SAME: ) #[[ATTR1]] {
+// CXX-NONE-SAME: ) #[[ATTR1]] {
+int toplevel_func_noattr(int x) {
+ return x;
+}
// C-only: Function without prototype
#ifndef __cplusplus
@@ -19,9 +49,9 @@
#pragma clang diagnostic ignored "-Wimplicit-int"
// C-LABEL: define dso_local i32 @no_proto_func(
-// C-SAME: ) #[[ATTR0]] {
+// C-SAME: ) #[[ATTR1]] {
-[[clang::no_outline]] no_proto_func(x)
+ATTR no_proto_func(x)
int x; {
return x;
}
@@ -32,14 +62,25 @@ int x; {
// With Blocks
#if __has_feature(blocks)
+// C-LABEL: define dso_local i32 @func_with_block(
+// C-ATTR-SAME: ) #[[ATTR2]] {
+// C-ARG-SAME: ) #[[ATTR1]] {
+// C-NONE-SAME: ) #[[ATTR1]] {
+
+// CXX-LABEL: define dso_local noundef i32 @_Z15func_with_blocki(
+// CXX-ATTR-SAME: ) #[[ATTR2]] {
+// CXX-ARG-SAME: ) #[[ATTR1]] {
+// CXX-NONE-SAME: ) #[[ATTR1]] {
int func_with_block(int x) {
+
// C-LABEL: define internal i32 @__func_with_block_block_invoke(
-// C-SAME: ) #[[ATTR0]] {
+// C-SAME: ) #[[ATTR1]] {
// CXX-LABEL: define internal noundef i32 @___Z15func_with_blocki_block_invoke(
-// CXX-SAME: ) #[[ATTR1:[0-9]+]] {
-
- int (^block)(int) = ^ __attribute__((no_outline)) int (int y) { return y; };
+// CXX-ATTR-SAME: ) #[[ATTR3:[0-9]+]] {
+// CXX-ARG-SAME: ) #[[ATTR2:[0-9]+]] {
+// CXX-NONE-SAME: ) #[[ATTR2:[0-9]+]] {
+ int (^block)(int) = ^ ATTR_DUNDER int (int y) { return y; };
return block(x);
}
@@ -51,57 +92,74 @@ int func_with_block(int x) {
struct my_struct {
// CXX-LABEL: define linkonce_odr noundef i32 @_ZN9my_struct11member_funcEi(
-// CXX-SAME: ) #[[ATTR0]] comdat
- [[clang::no_outline]] int member_func(int x) {
+// CXX-SAME: ) #[[ATTR1]] comdat
+ ATTR int member_func(int x) {
return x;
}
// CXX-LABEL: define linkonce_odr noundef i32 @_ZN9my_struct11static_funcEi(
-// CXX-SAME: ) #[[ATTR0]] comdat
- [[clang::no_outline]] static int static_func(int x) {
+// CXX-SAME: ) #[[ATTR1]] comdat
+ ATTR static int static_func(int x) {
return x;
}
};
template <typename T> struct templated_struct {
- [[clang::no_outline]] T member_func(T x) {
+ ATTR T member_func(T x) {
return x;
}
- [[clang::no_outline]] static T static_func(T x) {
+ ATTR static T static_func(T x) {
return x;
}
};
// CXX-LABEL: define weak_odr noundef i32 @_ZN16templated_structIiE11member_funcEi(
-// CXX-SAME: ) #[[ATTR0]] comdat
+// CXX-SAME: ) #[[ATTR1]] comdat
// CXX-LABEL: define weak_odr noundef i32 @_ZN16templated_structIiE11static_funcEi(
-// CXX-SAME: ) #[[ATTR0]] comdat
+// CXX-SAME: ) #[[ATTR1]] comdat
template struct templated_struct<int>;
-#if __cplusplus >= 202302L
+// CXX-LABEL: define dso_local noundef i32 @_Z16func_with_lambdai(
+// CXX-ATTR-SAME: ) #[[ATTR2]]
+// CXX-ARG-SAME: ) #[[ATTR1]]
+// CXX-NONE-SAME: ) #[[ATTR1]]
int func_with_lambda(int x) {
- // CXX23-LABEL: define internal noundef i32 @"_ZZ16func_with_lambdaiENK3$_0clEv"(
- // CXX23-SAME: ) #[[ATTR0]]
- auto lambda = [x][[clang::no_outline]]() -> int {
+
+// CXX-LABEL: define internal noundef i32 @"_ZZ16func_with_lambdaiENK3$_0clEv"(
+// CXX-SAME: ) #[[ATTR1]]
+ auto lambda = [x] ATTR () -> int {
return x;
};
return lambda();
}
#endif
-#endif
-// C: attributes #[[ATTR0]] = {
-// C-SAME: nooutline
+// C: attributes #[[ATTR1]] = {
+// C-ATTR-SAME: nooutline
+// C-ARG-SAME: nooutline
+// C-NONE-NOT: nooutline
// C-SAME: }
-// CXX: attributes #[[ATTR0]] = {
-// CXX-SAME: nooutline
-// CXX-SAME: }
+// C-ATTR: attributes #[[ATTR2]] = {
+// C-ATTR-NOT: nooutline
+// C-ATTR-SAME: }
// CXX: attributes #[[ATTR1]] = {
-// CXX-SAME: nooutline
+// CXX-ATTR-SAME: nooutline
+// CXX-ARG-SAME: nooutline
+// CXX-NONE-NOT: nooutline
// CXX-SAME: }
+
+// CXX: attributes #[[ATTR2]] = {
+// CXX-ATTR-NOT: nooutline
+// CXX-ARG-SAME: nooutline
+// CXX-NONE-NOT: nooutline
+// CXX-SAME: }
+
+// CXX-ATTR: attributes #[[ATTR3]] = {
+// CXX-ATTR-SAME: nooutline
+// CXX-ATTR-SAME: }
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl
new file mode 100644
index 0000000000000..8b1fb9038bedd
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl
@@ -0,0 +1,121 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - -fmatrix-memory-layout=column-major %s | FileCheck %s --check-prefixes=CHECK,COL-CHECK
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - -fmatrix-memory-layout=row-major %s | FileCheck %s --check-prefixes=CHECK,ROW-CHECK
+
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z2fnu11matrix_typeILm2ELm2EfE(
+// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [2 x <2 x float>], align 4
+// CHECK-NEXT: [[V:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// CHECK-NEXT: [[MATRIXEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x float> poison, float [[MATRIXEXT]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// COL-CHECK-NEXT: [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+// ROW-CHECK-NEXT: [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x float> [[VECINIT]], float [[MATRIXEXT1]], i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// COL-CHECK-NEXT: [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+// ROW-CHECK-NEXT: [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <4 x float> [[VECINIT2]], float [[MATRIXEXT3]], i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
+// CHECK-NEXT: [[MATRIXEXT5:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x float> [[VECINIT4]], float [[MATRIXEXT5]], i32 3
+// CHECK-NEXT: store <4 x float> [[VECINIT6]], ptr [[V]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[V]], align 16
+// CHECK-NEXT: ret <4 x float> [[TMP4]]
+//
+float4 fn(float2x2 m) {
+ float4 v = m;
+ return v;
+}
+
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z2fnDv4_i(
+// CHECK-SAME: <4 x i32> noundef [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[M:%.*]] = alloca [2 x <2 x i32>], align 4
+// CHECK-NEXT: store <4 x i32> [[V]], ptr [[V_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
+// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VECEXT]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
+// CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[TMP1]], i64 2
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
+// CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 1
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT2]], i32 [[VECEXT3]], i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
+// CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT4]], i32 [[VECEXT5]], i32 3
+// CHECK-NEXT: store <4 x i32> [[VECINIT6]], ptr [[M]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[M]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+//
+int2x2 fn(int4 v) {
+ int2x2 m = v;
+ return m;
+}
+
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z3fn1Dv2_i(
+// CHECK-SAME: <2 x i32> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT: store <2 x i32> [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VECEXT]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VECINIT2]]
+//
+int1x2 fn1(int2 v) {
+ return v;
+}
+
+// CHECK-LABEL: define hidden noundef <3 x i1> @_Z3fn2Dv3_b(
+// CHECK-SAME: <3 x i1> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <3 x i32>, align 16
+// CHECK-NEXT: [[TMP0:%.*]] = zext <3 x i1> [[B]] to <3 x i32>
+// CHECK-NEXT: store <3 x i32> [[TMP0]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[LOADEDV:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i1>
+// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <3 x i1> [[LOADEDV]], i64 0
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <3 x i1> poison, i1 [[VECEXT]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[LOADEDV1:%.*]] = trunc <3 x i32> [[TMP2]] to <3 x i1>
+// CHECK-NEXT: [[VECEXT2:%.*]] = extractelement <3 x i1> [[LOADEDV1]], i64 1
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <3 x i1> [[VECINIT]], i1 [[VECEXT2]], i32 1
+// CHECK-NEXT: [[TMP3:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[LOADEDV4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i1>
+// CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <3 x i1> [[LOADEDV4]], i64 2
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <3 x i1> [[VECINIT3]], i1 [[VECEXT5]], i32 2
+// CHECK-NEXT: ret <3 x i1> [[VECINIT6]]
+//
+bool3x1 fn2(bool3 b) {
+ return b;
+}
+
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z3fn3u11matrix_typeILm1ELm3EbE(
+// CHECK-SAME: <3 x i1> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// COL-CHECK-NEXT: [[B_ADDR:%.*]] = alloca [3 x <1 x i32>], align 4
+// ROW-CHECK-NEXT: [[B_ADDR:%.*]] = alloca [1 x <3 x i32>], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = zext <3 x i1> [[B]] to <3 x i32>
+// CHECK-NEXT: store <3 x i32> [[TMP0]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[MATRIXEXT:%.*]] = extractelement <3 x i32> [[TMP1]], i32 0
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <3 x i32> poison, i32 [[MATRIXEXT]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[MATRIXEXT1:%.*]] = extractelement <3 x i32> [[TMP2]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <3 x i32> [[VECINIT]], i32 [[MATRIXEXT1]], i32 1
+// CHECK-NEXT: [[TMP3:%.*]] = load <3 x i32>, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[MATRIXEXT3:%.*]] = extractelement <3 x i32> [[TMP3]], i32 2
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <3 x i32> [[VECINIT2]], i32 [[MATRIXEXT3]], i32 2
+// CHECK-NEXT: ret <3 x i32> [[VECINIT4]]
+//
+int3 fn3(bool1x3 b) {
+ return b;
+}
diff --git a/clang/test/CodeGenHLSL/matrix_types.hlsl b/clang/test/CodeGenHLSL/matrix_types.hlsl
index 1c2f9cd316543..c502a79d28e23 100644
--- a/clang/test/CodeGenHLSL/matrix_types.hlsl
+++ b/clang/test/CodeGenHLSL/matrix_types.hlsl
@@ -16,31 +16,31 @@
// CHECK-ROW-MAJOR: @bool1x2_Val = external hidden addrspace(2) global [1 x <2 x i32>], align 4
// CHECK-ROW-MAJOR: @bool1x3_Val = external hidden addrspace(2) global [1 x <3 x i32>], align 4
// CHECK-ROW-MAJOR: @bool1x4_Val = external hidden addrspace(2) global [1 x <4 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool2x1_Val = external hidden addrspace(2) global [2 x <1 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool2x2_Val = external hidden addrspace(2) global [2 x <2 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool2x3_Val = external hidden addrspace(2) global [2 x <3 x i32>], align 4
+// CHECK-ROW-MAJOR: @bool2x1_Val = external hidden addrspace(2) global <{ [1 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool2x2_Val = external hidden addrspace(2) global <{ [1 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool2x3_Val = external hidden addrspace(2) global <{ [1 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
// CHECK-ROW-MAJOR: @bool2x4_Val = external hidden addrspace(2) global [2 x <4 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool3x1_Val = external hidden addrspace(2) global [3 x <1 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool3x2_Val = external hidden addrspace(2) global [3 x <2 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool3x3_Val = external hidden addrspace(2) global [3 x <3 x i32>], align 4
+// CHECK-ROW-MAJOR: @bool3x1_Val = external hidden addrspace(2) global <{ [2 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool3x2_Val = external hidden addrspace(2) global <{ [2 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool3x3_Val = external hidden addrspace(2) global <{ [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
// CHECK-ROW-MAJOR: @bool3x4_Val = external hidden addrspace(2) global [3 x <4 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool4x1_Val = external hidden addrspace(2) global [4 x <1 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool4x2_Val = external hidden addrspace(2) global [4 x <2 x i32>], align 4
-// CHECK-ROW-MAJOR: @bool4x3_Val = external hidden addrspace(2) global [4 x <3 x i32>], align 4
+// CHECK-ROW-MAJOR: @bool4x1_Val = external hidden addrspace(2) global <{ [3 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool4x2_Val = external hidden addrspace(2) global <{ [3 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-ROW-MAJOR: @bool4x3_Val = external hidden addrspace(2) global <{ [3 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
// CHECK-ROW-MAJOR: @bool4x4_Val = external hidden addrspace(2) global [4 x <4 x i32>], align 4
// CHECK-COL-MAJOR: @bool1x1_Val = external hidden addrspace(2) global [1 x <1 x i32>], align 4
-// CHECK-COL-MAJOR: @bool1x2_Val = external hidden addrspace(2) global [2 x <1 x i32>], align 4
-// CHECK-COL-MAJOR: @bool1x3_Val = external hidden addrspace(2) global [3 x <1 x i32>], align 4
-// CHECK-COL-MAJOR: @bool1x4_Val = external hidden addrspace(2) global [4 x <1 x i32>], align 4
+// CHECK-COL-MAJOR: @bool1x2_Val = external hidden addrspace(2) global <{ [1 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool1x3_Val = external hidden addrspace(2) global <{ [2 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool1x4_Val = external hidden addrspace(2) global <{ [3 x <{ <1 x i32>, target("dx.Padding", 12) }>], <1 x i32> }>, align 4
// CHECK-COL-MAJOR: @bool2x1_Val = external hidden addrspace(2) global [1 x <2 x i32>], align 4
-// CHECK-COL-MAJOR: @bool2x2_Val = external hidden addrspace(2) global [2 x <2 x i32>], align 4
-// CHECK-COL-MAJOR: @bool2x3_Val = external hidden addrspace(2) global [3 x <2 x i32>], align 4
-// CHECK-COL-MAJOR: @bool2x4_Val = external hidden addrspace(2) global [4 x <2 x i32>], align 4
+// CHECK-COL-MAJOR: @bool2x2_Val = external hidden addrspace(2) global <{ [1 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool2x3_Val = external hidden addrspace(2) global <{ [2 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool2x4_Val = external hidden addrspace(2) global <{ [3 x <{ <2 x i32>, target("dx.Padding", 8) }>], <2 x i32> }>, align 4
// CHECK-COL-MAJOR: @bool3x1_Val = external hidden addrspace(2) global [1 x <3 x i32>], align 4
-// CHECK-COL-MAJOR: @bool3x2_Val = external hidden addrspace(2) global [2 x <3 x i32>], align 4
-// CHECK-COL-MAJOR: @bool3x3_Val = external hidden addrspace(2) global [3 x <3 x i32>], align 4
-// CHECK-COL-MAJOR: @bool3x4_Val = external hidden addrspace(2) global [4 x <3 x i32>], align 4
+// CHECK-COL-MAJOR: @bool3x2_Val = external hidden addrspace(2) global <{ [1 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool3x3_Val = external hidden addrspace(2) global <{ [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
+// CHECK-COL-MAJOR: @bool3x4_Val = external hidden addrspace(2) global <{ [3 x <{ <3 x i32>, target("dx.Padding", 4) }>], <3 x i32> }>, align 4
// CHECK-COL-MAJOR: @bool4x1_Val = external hidden addrspace(2) global [1 x <4 x i32>], align 4
// CHECK-COL-MAJOR: @bool4x2_Val = external hidden addrspace(2) global [2 x <4 x i32>], align 4
// CHECK-COL-MAJOR: @bool4x3_Val = external hidden addrspace(2) global [3 x <4 x i32>], align 4
diff --git a/clang/test/CodeGenHLSL/resources/cbuffer_matrix_align.hlsl b/clang/test/CodeGenHLSL/resources/cbuffer_matrix_align.hlsl
new file mode 100644
index 0000000000000..70b2732372691
--- /dev/null
+++ b/clang/test/CodeGenHLSL/resources/cbuffer_matrix_align.hlsl
@@ -0,0 +1,71 @@
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fsyntax-only -verify -verify-ignore-unexpected=warning
+
+cbuffer MatArr0Pass {
+ float2x4 A0p[2] : packoffset(c0.x);
+ float a0tail : packoffset(c4.x);
+}
+
+cbuffer MatArr0Fail {
+ float2x4 A0f[2] : packoffset(c0.x);
+ float a0bad : packoffset(c3.z);
+ // expected-error@-1 {{packoffset overlap between 'a0bad', 'A0f'}}
+}
+
+// Struct containing a matrix.
+
+struct MS0 {
+ float2x4 M;
+ float2 V;
+};
+
+cbuffer MatStruct0Pass {
+ MS0 s0p : packoffset(c0.x);
+ float s0tail: packoffset(c2.z);
+}
+
+cbuffer MatStruct0Fail {
+ MS0 s0f : packoffset(c0.x);
+ float s0bad : packoffset(c2.y);
+ // expected-error@-1 {{packoffset overlap between 's0bad', 's0f'}}
+}
+
+// Nested struct containing a matrix.
+struct Inner0 {
+ float2x4 M;
+ float F;
+};
+
+struct Outer0 {
+ float2 Head;
+ Inner0 I;
+ float2 Tail;
+};
+
+cbuffer MatNested0Pass {
+ Outer0 o0p : packoffset(c0.x);
+ float o0tail: packoffset(c4.x);
+}
+
+cbuffer MatNested0Fail {
+ Outer0 o0f : packoffset(c0.x);
+ float o0bad: packoffset(c3.z);
+ // expected-error@-1 {{packoffset overlap between 'o0bad', 'o0f'}}
+}
+
+// Array-of-struct where struct contains a matrix.
+
+struct AMS0 {
+ float2x4 M;
+ float2 V;
+};
+
+cbuffer MatArrStruct0Pass {
+ AMS0 as0p[2] : packoffset(c0.x);
+ float as0tail : packoffset(c5.z);
+}
+
+cbuffer MatArrStruct0Fail {
+ AMS0 as0f[2] : packoffset(c0.x);
+ float as0bad : packoffset(c5.y);
+ // expected-error@-1 {{packoffset overlap between 'as0bad', 'as0f'}}
+}
diff --git a/clang/test/CodeGenHLSL/resources/default_cbuffer_with_layout.hlsl b/clang/test/CodeGenHLSL/resources/default_cbuffer_with_layout.hlsl
index 7be1f9043042c..63960f817de8f 100644
--- a/clang/test/CodeGenHLSL/resources/default_cbuffer_with_layout.hlsl
+++ b/clang/test/CodeGenHLSL/resources/default_cbuffer_with_layout.hlsl
@@ -9,7 +9,10 @@
// CHECK-SAME: %S
// CHECK-SAME: i32,
// CHECK-SAME: target("dx.Padding", 4),
-// CHECK-SAME: <4 x float>
+// CHECK-SAME: <4 x float>,
+// CHECK-SAME: <{ [2 x <{ <2 x float>, target("dx.Padding", 8) }>], <2 x float> }>,
+// CHECK-SAME: target("dx.Padding", 8),
+// CHECK-SAME: [3 x <4 x float>]
// CHECK-SAME: }>
// CHECK: %S = type <{ <2 x float> }>
@@ -21,6 +24,8 @@
// CHECK-DAG: @d = external hidden addrspace(2) global <4 x i32>, align 16
// CHECK-DAG: @e = external hidden addrspace(2) global <4 x float>, align 16
// CHECK-DAG: @s = external hidden addrspace(2) global %S, align 1
+// CHECK-DAG: @m = external hidden addrspace(2) global <{ [2 x <{ <2 x float>, target("dx.Padding", 8) }>], <2 x float> }>, align 4
+// CHECK-DAG: @n = external hidden addrspace(2) global [3 x <4 x float>], align 4
struct S {
float2 v;
@@ -32,6 +37,8 @@ int4 d : register(c6);
double c[4] : register(c2);
float4 e;
S s : register(c7);
+float2x3 m;
+float4x3 n;
RWBuffer<float> Buf;
@@ -41,4 +48,4 @@ void main() {
}
// CHECK: !hlsl.cbs = !{![[CB:.*]]}
-// CHECK: ![[CB]] = !{ptr @"$Globals.cb", ptr addrspace(2) @b, ptr addrspace(2) @c, ptr addrspace(2) @d, ptr addrspace(2) @s, ptr addrspace(2) @a, ptr addrspace(2) @e}
+// CHECK: ![[CB]] = !{ptr @"$Globals.cb", ptr addrspace(2) @b, ptr addrspace(2) @c, ptr addrspace(2) @d, ptr addrspace(2) @s, ptr addrspace(2) @a, ptr addrspace(2) @e, ptr addrspace(2) @m, ptr addrspace(2) @n}
diff --git a/clang/test/CodeGenObjC/attr-no-outline.m b/clang/test/CodeGenObjC/attr-no-outline.m
index 16d1a9eb867a0..8819f1d81107c 100644
--- a/clang/test/CodeGenObjC/attr-no-outline.m
+++ b/clang/test/CodeGenObjC/attr-no-outline.m
@@ -1,9 +1,28 @@
-// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s --check-prefix=OBJC
-// RUN: %clang_cc1 -emit-llvm -x objective-c++ %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s --check-prefix=OBJCXX
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - -DTEST_ATTR | FileCheck %s --check-prefixes=OBJC,OBJC-ATTR
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - -mno-outline | FileCheck %s --check-prefixes=OBJC,OBJC-ARG
+// RUN: %clang_cc1 -emit-llvm %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s --check-prefixes=OBJC,OBJC-NONE
+
+// RUN: %clang_cc1 -emit-llvm -x objective-c++ %s -triple x86_64-unknown-linux-gnu -o - -DTEST_ATTR | FileCheck %s --check-prefixes=OBJCXX,OBJCXX-ATTR
+// RUN: %clang_cc1 -emit-llvm -x objective-c++ %s -triple x86_64-unknown-linux-gnu -o - -mno-outline | FileCheck %s --check-prefixes=OBJCXX,OBJCXX-ARG
+// RUN: %clang_cc1 -emit-llvm -x objective-c++ %s -triple x86_64-unknown-linux-gnu -o - | FileCheck %s --check-prefixes=OBJCXX,OBJCXX-NONE
+
+// This test checks that:
+// - [[clang::no_outline]] adds the nooutline IR attribute to specific definitions
+// - `-mno-outline` adds the nooutline IR attribute to all definitions
+// - Lack of either does not add nooutline IR attribute
+
+
+#ifdef TEST_ATTR
+#define ATTR [[clang::no_outline]]
+#else
+#define ATTR
+#endif
@interface Test
- (int)method:(int)x;
+- (int)method_no_attr:(int)x;
+ (int)static_method:(int)x;
++ (int)static_method_no_attr:(int)x;
@end
@implementation Test
@@ -13,7 +32,20 @@ @implementation Test
// OBJCXX-LABEL: define internal noundef i32 @"\01-[Test method:]"(
// OBJCXX: ) #[[ATTR0:[0-9]+]] {
-- (int)method:(int)x [[clang::no_outline]] {
+- (int)method:(int)x ATTR {
+ return x;
+}
+
+// OBJC-LABEL: define internal i32 @"\01-[Test method_no_attr:]"(
+// OBJC-ATTR: ) #[[ATTR1:[0-9]+]] {
+// OBJC-ARG: ) #[[ATTR0]] {
+// OBJC-NONE: ) #[[ATTR0]] {
+
+// OBJCXX-LABEL: define internal noundef i32 @"\01-[Test method_no_attr:]"(
+// OBJCXX-ATTR: ) #[[ATTR1:[0-9]+]] {
+// OBJCXX-ARG: ) #[[ATTR0]] {
+// OBJCXX-NONE: ) #[[ATTR0]] {
+- (int)method_no_attr:(int) x {
return x;
}
@@ -22,19 +54,44 @@ - (int)method:(int)x [[clang::no_outline]] {
// OBJCXX-LABEL: define internal noundef i32 @"\01+[Test static_method:]"(
// OBJCXX: ) #[[ATTR0]] {
-+ (int)static_method:(int)x [[clang::no_outline]] {
++ (int)static_method:(int)x ATTR {
+ return x;
+}
+
+
+// OBJC-LABEL: define internal i32 @"\01+[Test static_method_no_attr:]"(
+// OBJC-ATTR: ) #[[ATTR1]] {
+// OBJC-ARG: ) #[[ATTR0]] {
+// OBJC-NONE: ) #[[ATTR0]] {
+
+
+// OBJCXX-LABEL: define internal noundef i32 @"\01+[Test static_method_no_attr:]"(
+// OBJCXX-ATTR: ) #[[ATTR1]] {
+// OBJCXX-ARG: ) #[[ATTR0]] {
+// OBJCXX-NONE: ) #[[ATTR0]] {
+
++ (int)static_method_no_attr:(int)x {
return x;
}
@end
// OBJC: attributes #[[ATTR0]] = {
-// OBJC-SAME: nooutline
+// OBJC-ATTR-SAME: nooutline
+// OBJC-ARG-SAME: nooutline
+// OBJC-NONE-NOT: nooutline
// OBJC-SAME: }
+// OBJC-ATTR: attributes #[[ATTR1]] = {
+// OBJC-ATTR-NOT: nooutline
+// OBJC-ATTR-SAME: }
+
// OBJCXX: attributes #[[ATTR0]] = {
-// OBJCXX-SAME: nooutline
+// OBJCXX-ATTR-SAME: nooutline
+// OBJCXX-ARG-SAME: nooutline
+// OBJCXX-NONE-NOT: nooutline
// OBJCXX-SAME: }
-
-
+// OBJCXX-ATTR: attributes #[[ATTR1]] = {
+// OBJCXX-ATTR-NOT: nooutline
+// OBJCXX-ATTR-SAME: }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
index 6326866ed3c35..47ae7ce82becf 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
@@ -1,6 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1170 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s
typedef int v2i __attribute__((ext_vector_type(2)));
typedef float v8f __attribute__((ext_vector_type(8)));
@@ -14,12 +15,12 @@ typedef int v8i __attribute__((ext_vector_type(8)));
// amdgcn_wmma_f32_16x16x16_f16
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]])
+// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
{
@@ -30,12 +31,12 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
// amdgcn_wmma_f32_16x16x16_bf16
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]])
+// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
{
@@ -46,12 +47,12 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c
// amdgcn_wmma_f16_16x16x16_f16
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false)
+// CHECK-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
{
@@ -62,12 +63,12 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
// amdgcn_wmma_bf16_16x16x16_bf16
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false)
+// CHECK-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
{
@@ -78,12 +79,12 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s
// amdgcn_wmma_i32_16x16x16_iu8
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
{
@@ -94,79 +95,79 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
// amdgcn_wmma_i32_16x16x16_iu4
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false)
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
+// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
+// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
+// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]])
+// CHECK-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false)
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
}
//.
-// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
-// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"}
-// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
+// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
+// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
//.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
index a79c3d4da1ebb..98ce84adf1554 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
@@ -1,6 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1170 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s
typedef float v4f __attribute__((ext_vector_type(4)));
typedef half v4h __attribute__((ext_vector_type(4)));
@@ -13,12 +14,12 @@ typedef int v4i __attribute__((ext_vector_type(4)));
// amdgcn_wmma_f32_16x16x16_f16
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_f16_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]])
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
{
@@ -29,12 +30,12 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
// amdgcn_wmma_f32_16x16x16_bf16
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]])
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
{
@@ -45,12 +46,12 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c
// amdgcn_wmma_f16_16x16x16_f16
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f16_16x16x16_f16_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false)
+// CHECK-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
{
@@ -61,12 +62,12 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
// amdgcn_wmma_bf16_16x16x16_bf16
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false)
+// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
{
@@ -77,12 +78,12 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s
// amdgcn_wmma_i32_16x16x16_iu8
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
+// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
{
@@ -93,79 +94,79 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
// amdgcn_wmma_i32_16x16x16_iu4
//
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
+// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
-// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]])
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c)
{
*out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
}
-// CHECK-GFX1200-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
-// CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-GFX1200-NEXT: [[ENTRY:.*:]]
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
-// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
-// CHECK-GFX1200-NEXT: ret void
+// CHECK-LABEL: define dso_local void @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false)
+// CHECK-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]]
+// CHECK-NEXT: ret void
//
void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c)
{
*out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false);
}
//.
-// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
-// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"}
-// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
+// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
+// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0}
//.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
index a1a56f0d8417d..ed72a8ee7dbd2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
@@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b
global v16s* out16s, v2i a2i, v2i b2i, v16s c16s,
global v8i* out8i, v4i a4i, v4i b4i, v8i c8i)
{
- *out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature gfx11-insts,wavefrontsize32}}
- *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature gfx11-insts,wavefrontsize32}}
+ *out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
+ *out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
}
#endif
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl
index d995b1dc46be7..4b1808fe6d6e6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl
@@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out4f, v16h a16h, v16h b
global v8s* out8s, v4i a4i, v4i b4i, v8s c8s,
global v4i* out4i, v2i a2i, v2i b2i, v4i c4i)
{
- *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature gfx11-insts,wavefrontsize64}}
- *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature gfx11-insts,wavefrontsize64}}
+ *out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
+ *out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
}
#endif
diff --git a/clang/test/Driver/Inputs/XRSimulator1.0.sdk/usr/include/libxml/.keep b/clang/test/Driver/Inputs/XRSimulator1.0.sdk/usr/include/libxml/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/aarch64-outliner.c b/clang/test/Driver/aarch64-outliner.c
index 5ed822f122fc4..4d5b7321e330f 100644
--- a/clang/test/Driver/aarch64-outliner.c
+++ b/clang/test/Driver/aarch64-outliner.c
@@ -3,4 +3,4 @@
// ON: "-mllvm" "-enable-machine-outliner"
// RUN: %clang --target=aarch64 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
// RUN: %clang --target=aarch64_be -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
-// OFF: "-mllvm" "-enable-machine-outliner=never"
+// OFF: "-mno-outline" "-mllvm" "-enable-machine-outliner=never"
diff --git a/clang/test/Driver/arm-machine-outliner.c b/clang/test/Driver/arm-machine-outliner.c
index a1e705cb60a1b..efa29d2ab8450 100644
--- a/clang/test/Driver/arm-machine-outliner.c
+++ b/clang/test/Driver/arm-machine-outliner.c
@@ -3,6 +3,6 @@
// RUN: %clang -target armv7-linux-gnueabihf -flto -moutline %s -### 2>&1 | FileCheck %s -check-prefix=ON-LTO
// ON-LTO: "-plugin-opt=-enable-machine-outliner"
// RUN: %clang -target armv7-linux-gnueabihf -moutline -mno-outline -c %s -### 2>&1 | FileCheck %s -check-prefix=OFF
-// OFF: "-mllvm" "-enable-machine-outliner=never"
+// OFF: "-mno-outline" "-mllvm" "-enable-machine-outliner=never"
// RUN: %clang -target armv7-linux-gnueabihf -flto -moutline -mno-outline %s -### 2>&1 | FileCheck %s -check-prefix=OFF-LTO
// OFF-LTO: "-plugin-opt=-enable-machine-outliner=never"
diff --git a/clang/test/Driver/crash-diagnostics-dir-3.c b/clang/test/Driver/crash-diagnostics-dir-3.c
index a91bc48d7e462..63a5efc853a4c 100644
--- a/clang/test/Driver/crash-diagnostics-dir-3.c
+++ b/clang/test/Driver/crash-diagnostics-dir-3.c
@@ -1,6 +1,6 @@
// RUN: export LSAN_OPTIONS=detect_leaks=0
// RUN: rm -rf %t
-// RUN: not env CLANG_CRASH_DIAGNOSTICS_DIR=%t %clang -c %s -o - 2>&1 | FileCheck %s
+// RUN: not %crash_opt env CLANG_CRASH_DIAGNOSTICS_DIR=%t %clang -c %s -o - 2>&1 | FileCheck %s
#pragma clang __debug parser_crash
// CHECK: Preprocessed source(s) and associated run script(s) are located at:
// CHECK: diagnostic msg: {{.*}}{{/|\\}}crash-diagnostics-dir-3.c.tmp{{(/|\\).*}}.c
diff --git a/clang/test/Driver/crash-diagnostics-dir.c b/clang/test/Driver/crash-diagnostics-dir.c
index 16382eff1cde7..9a8299bffe005 100644
--- a/clang/test/Driver/crash-diagnostics-dir.c
+++ b/clang/test/Driver/crash-diagnostics-dir.c
@@ -1,6 +1,6 @@
// RUN: export LSAN_OPTIONS=detect_leaks=0
// RUN: rm -rf %t
-// RUN: not %clang -fcrash-diagnostics-dir=%t -c %s -o - 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -fcrash-diagnostics-dir=%t -c %s -o - 2>&1 | FileCheck %s
#pragma clang __debug parser_crash
// CHECK: Preprocessed source(s) and associated run script(s) are located at:
// CHECK: diagnostic msg: {{.*}}{{/|\\}}crash-diagnostics-dir.c.tmp{{(/|\\).*}}.c
diff --git a/clang/test/Driver/crash-ir-repro.cpp b/clang/test/Driver/crash-ir-repro.cpp
index 217d5ed421bdb..1a2000ad1279f 100644
--- a/clang/test/Driver/crash-ir-repro.cpp
+++ b/clang/test/Driver/crash-ir-repro.cpp
@@ -1,5 +1,5 @@
// RUN: %clang -S -emit-llvm -o %t.ll %s
-// RUN: not %clang -S -DCRASH %s -o %t.ll 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -S -DCRASH %s -o %t.ll 2>&1 | FileCheck %s
// TODO(boomanaiden154): This test case causes clang to raise a signal when
// running under ubsan, but not in normal build configurations. This should
diff --git a/clang/test/Driver/crash-report-clang-cl.cpp b/clang/test/Driver/crash-report-clang-cl.cpp
index 963c3b6d0ab03..f61b94626f584 100644
--- a/clang/test/Driver/crash-report-clang-cl.cpp
+++ b/clang/test/Driver/crash-report-clang-cl.cpp
@@ -2,7 +2,7 @@
// RUN: rm -rf %t
// RUN: mkdir %t
-// RUN: not %clang_cl -fsyntax-only /Brepro /source-charset:utf-8 \
+// RUN: not %crash_opt %clang_cl -fsyntax-only /Brepro /source-charset:utf-8 \
// RUN: -fcrash-diagnostics-dir=%t -- %s 2>&1 | FileCheck %s
// RUN: cat %t/crash-report-clang-cl-*.cpp | FileCheck --check-prefix=CHECKSRC %s
// RUN: cat %t/crash-report-clang-cl-*.sh | FileCheck --check-prefix=CHECKSH %s
diff --git a/clang/test/Driver/crash-report-header.h b/clang/test/Driver/crash-report-header.h
index 04865a0cc300f..6d5156537126d 100644
--- a/clang/test/Driver/crash-report-header.h
+++ b/clang/test/Driver/crash-report-header.h
@@ -1,7 +1,7 @@
// RUN: export LSAN_OPTIONS=detect_leaks=0
// RUN: rm -rf %t
// RUN: mkdir %t
-// RUN: env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 not %clang -fsyntax-only %s 2>&1 | FileCheck %s
+// RUN: env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 not %crash_opt %clang -fsyntax-only %s 2>&1 | FileCheck %s
// RUN: cat %t/crash-report-header-*.h | FileCheck --check-prefix=CHECKSRC "%s"
// RUN: cat %t/crash-report-header-*.sh | FileCheck --check-prefix=CHECKSH "%s"
// REQUIRES: crash-recovery
diff --git a/clang/test/Driver/crash-report-spaces.c b/clang/test/Driver/crash-report-spaces.c
index b4d8ac1f57e83..b5fbb59683fc0 100644
--- a/clang/test/Driver/crash-report-spaces.c
+++ b/clang/test/Driver/crash-report-spaces.c
@@ -2,7 +2,7 @@
// RUN: rm -rf "%t"
// RUN: mkdir "%t"
// RUN: cp "%s" "%t/crash report spaces.c"
-// RUN: env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 not %clang -fsyntax-only "%t/crash report spaces.c" 2>&1 | FileCheck "%s"
+// RUN: env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 not %crash_opt %clang -fsyntax-only "%t/crash report spaces.c" 2>&1 | FileCheck "%s"
// RUN: cat "%t/crash report spaces"-*.c | FileCheck --check-prefix=CHECKSRC "%s"
// RUN: cat "%t/crash report spaces"-*.sh | FileCheck --check-prefix=CHECKSH "%s"
// REQUIRES: crash-recovery
diff --git a/clang/test/Driver/crash-report-with-asserts.c b/clang/test/Driver/crash-report-with-asserts.c
index 686c49f339fb7..278860a9158e4 100644
--- a/clang/test/Driver/crash-report-with-asserts.c
+++ b/clang/test/Driver/crash-report-with-asserts.c
@@ -12,13 +12,13 @@
// RUN: env TMPDIR=%t TEMP=%t TMP=%t RC_DEBUG_OPTIONS=1 \
// RUN: CC_PRINT_HEADERS=1 CC_LOG_DIAGNOSTICS=1 \
-// RUN: not %clang %s @%t.rsp -DASSERT 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang %s @%t.rsp -DASSERT 2>&1 | FileCheck %s
// RUN: cat %t/crash-report-*.c | FileCheck --check-prefix=CHECKSRC %s
// RUN: cat %t/crash-report-*.sh | FileCheck --check-prefix=CHECKSH %s
// RUN: env TMPDIR=%t TEMP=%t TMP=%t RC_DEBUG_OPTIONS=1 \
// RUN: CC_PRINT_HEADERS=1 CC_LOG_DIAGNOSTICS=1 \
-// RUN: not %clang %s @%t.rsp -DUNREACHABLE 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang %s @%t.rsp -DUNREACHABLE 2>&1 | FileCheck %s
// RUN: cat %t/crash-report-with-asserts-*.c | FileCheck --check-prefix=CHECKSRC %s
// RUN: cat %t/crash-report-with-asserts-*.sh | FileCheck --check-prefix=CHECKSH %s
diff --git a/clang/test/Driver/crash-report.cpp b/clang/test/Driver/crash-report.cpp
index 59eee65af57ee..c431940bf9ea1 100644
--- a/clang/test/Driver/crash-report.cpp
+++ b/clang/test/Driver/crash-report.cpp
@@ -12,13 +12,13 @@
// RUN: env TMPDIR=%t TEMP=%t TMP=%t RC_DEBUG_OPTIONS=1 \
// RUN: CC_PRINT_HEADERS=1 CC_LOG_DIAGNOSTICS=1 \
-// RUN: not %clang %s @%t.rsp -DPARSER 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang %s @%t.rsp -DPARSER 2>&1 | FileCheck %s
// RUN: cat %t/crash-report-*.cpp | FileCheck --check-prefix=CHECKSRC %s
// RUN: cat %t/crash-report-*.sh | FileCheck --check-prefix=CHECKSH %s
// RUN: env TMPDIR=%t TEMP=%t TMP=%t RC_DEBUG_OPTIONS=1 \
// RUN: CC_PRINT_HEADERS=1 CC_LOG_DIAGNOSTICS=1 \
-// RUN: not %clang %s @%t.rsp -DCRASH 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang %s @%t.rsp -DCRASH 2>&1 | FileCheck %s
// RUN: cat %t/crash-report-*.cpp | FileCheck --check-prefix=CHECKSRC %s
// RUN: cat %t/crash-report-*.sh | FileCheck --check-prefix=CHECKSH %s
diff --git a/clang/test/Driver/emit-reproducer.c b/clang/test/Driver/emit-reproducer.c
index 18e1b4e41b91d..6fd1735ee8549 100644
--- a/clang/test/Driver/emit-reproducer.c
+++ b/clang/test/Driver/emit-reproducer.c
@@ -3,13 +3,13 @@
// RUN: echo "%s -fcrash-diagnostics-dir=%t -fsyntax-only" | sed -e 's/\\/\\\\/g' > %t.rsp
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer=off 2>&1 | FileCheck %s --check-prefix=NOT
-// RUN: not %clang -DFATAL @%t.rsp -fno-crash-diagnostics 2>&1 | FileCheck %s --check-prefix=NOT
-// RUN: not %clang -DFATAL @%t.rsp 2>&1 | FileCheck %s
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer=crash 2>&1 | FileCheck %s
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer=error 2>&1 | FileCheck %s
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer=always 2>&1 | FileCheck %s
-// RUN: not %clang -DFATAL @%t.rsp -gen-reproducer 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer=off 2>&1 | FileCheck %s --check-prefix=NOT
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -fno-crash-diagnostics 2>&1 | FileCheck %s --check-prefix=NOT
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer=crash 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer=error 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer=always 2>&1 | FileCheck %s
+// RUN: not %crash_opt %clang -DFATAL @%t.rsp -gen-reproducer 2>&1 | FileCheck %s
// RUN: not %clang -DERROR @%t.rsp -gen-reproducer=off 2>&1 | FileCheck %s --check-prefix=NOT
// RUN: not %clang -DERROR @%t.rsp -fno-crash-diagnostics 2>&1 | FileCheck %s --check-prefix=NOT
diff --git a/clang/test/Driver/incompatible_sysroot.c b/clang/test/Driver/incompatible_sysroot.c
index a5f7d03da7254..6bc8cd07d1f12 100644
--- a/clang/test/Driver/incompatible_sysroot.c
+++ b/clang/test/Driver/incompatible_sysroot.c
@@ -12,13 +12,14 @@
// RUN: %clang -target arm64-apple-visionos1.0-simulator -Wincompatible-sysroot -isysroot %S/Inputs/XRSimulator1.0.sdk -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-VISIONOSSIM %s
// RUN: %clang -target arm64-apple-xros1.0 -Wincompatible-sysroot -isysroot %S/Inputs/XRSimulator1.0.sdk -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-VISIONOSSIM-VISIONOS %s
// RUN: %clang -target arm64-apple-ios17.1 -Wincompatible-sysroot -isysroot %S/Inputs/XRSimulator1.0.sdk -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-VISIONOSSIM-IOS %s
+// RUN: %clang -target arm64-apple-visionos1.0-simulator -Wincompatible-sysroot -isysroot %S/Inputs/XRSimulator1.0.sdk/usr/include/libxml -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-VISIONOSSIM %s
int main() { return 0; }
-// CHECK-OSX-IOS: warning: using sysroot for 'MacOSX' but targeting 'x86_64-apple-ios9.0.0-simulator'
-// CHECK-IOS-WATCHOS: warning: using sysroot for 'iPhoneOS' but targeting 'arm64-apple-watchos2.0.0'
-// CHECK-IOS-TVOS: warning: using sysroot for 'iPhoneOS' but targeting 'arm64-apple-tvos9.0.0'
-// CHECK-OSX-DRIVERKIT: warning: using sysroot for 'MacOSX' but targeting 'x86_64-apple-driverkit19.0.0'
-// CHECK-IOS-DRIVERKIT: warning: using sysroot for 'iPhoneOS' but targeting 'x86_64-apple-driverkit19.0.0'
+// CHECK-OSX-IOS: warning: using sysroot for 'MacOSX10.9' but targeting 'x86_64-apple-ios9.0.0-simulator'
+// CHECK-IOS-WATCHOS: warning: using sysroot for 'iPhoneOS9.2' but targeting 'arm64-apple-watchos2.0.0'
+// CHECK-IOS-TVOS: warning: using sysroot for 'iPhoneOS9.2' but targeting 'arm64-apple-tvos9.0.0'
+// CHECK-OSX-DRIVERKIT: warning: using sysroot for 'MacOSX10.9' but targeting 'x86_64-apple-driverkit19.0.0'
+// CHECK-IOS-DRIVERKIT: warning: using sysroot for 'iPhoneOS9.2' but targeting 'x86_64-apple-driverkit19.0.0'
// CHECK-IOS-IOSSIM-NOT: warning: using sysroot for '{{.*}}' but targeting '{{.*}}'
// CHECK-OSX-IOS-DISABLED-NOT: warning: using sysroot for '{{.*}}' but targeting '{{.*}}'
diff --git a/clang/test/Driver/lit.local.cfg b/clang/test/Driver/lit.local.cfg
index 6370e9f92d89b..a47d0de90d763 100644
--- a/clang/test/Driver/lit.local.cfg
+++ b/clang/test/Driver/lit.local.cfg
@@ -1,4 +1,5 @@
from lit.llvm import llvm_config
+import sys
config.suffixes = [
".c",
@@ -27,6 +28,12 @@ config.substitutions.insert(
0, ("%clang_cc1", """*** Do not use 'clang -cc1' in Driver tests. ***""")
)
+is_windows = sys.platform.startswith("win")
+if is_windows:
+ config.substitutions.append(('%crash_opt', ''))
+else:
+ config.substitutions.append(('%crash_opt', '--crash'))
+
# Remove harmful environmental variables for clang Driver tests.
# Some might be useful for other tests so they are only removed here.
driver_overwrite_env_vars = [
diff --git a/clang/test/Driver/output-file-cleanup.c b/clang/test/Driver/output-file-cleanup.c
index 3628df8192652..432ff640656e7 100644
--- a/clang/test/Driver/output-file-cleanup.c
+++ b/clang/test/Driver/output-file-cleanup.c
@@ -2,7 +2,7 @@
// RUN: rm -f "%t.d" "%t1.s" "%t2.s" "%t3.s" "%t4.s" "%t5.s"
//
// RUN: touch %t.s
-// RUN: not %clang -S -DCRASH -o %t.s -MMD -MF %t.d %s
+// RUN: not %crash_opt %clang -S -DCRASH -o %t.s -MMD -MF %t.d %s
// RUN: test ! -f %t.s
// RUN: test ! -f %t.d
diff --git a/clang/test/Driver/riscv-outliner.c b/clang/test/Driver/riscv-outliner.c
index 9e9905ab4fd8a..fa69977331e13 100644
--- a/clang/test/Driver/riscv-outliner.c
+++ b/clang/test/Driver/riscv-outliner.c
@@ -4,4 +4,4 @@
// RUN: %clang --target=riscv32 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
// RUN: %clang --target=riscv64 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
-// OFF: "-mllvm" "-enable-machine-outliner=never"
+// OFF: "-mno-outline" "-mllvm" "-enable-machine-outliner=never"
diff --git a/clang/test/Driver/x86-outliner.c b/clang/test/Driver/x86-outliner.c
index e2af85d3d16ab..7da56ac93fa5e 100644
--- a/clang/test/Driver/x86-outliner.c
+++ b/clang/test/Driver/x86-outliner.c
@@ -4,4 +4,4 @@
// RUN: %clang --target=i386 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
// RUN: %clang --target=x86_64 -moutline -mno-outline -S %s -### 2>&1 | FileCheck %s -check-prefix=OFF
-// OFF: "-mllvm" "-enable-machine-outliner=never"
+// OFF: "-mno-outline" "-mllvm" "-enable-machine-outliner=never"
diff --git a/clang/test/Modules/demote-var-def.cpp b/clang/test/Modules/demote-var-def.cpp
deleted file mode 100644
index 811440dd736f2..0000000000000
--- a/clang/test/Modules/demote-var-def.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-// RUN: cd %t
-//
-// DEFINE: %{common-flags}= -I %t -isystem %t -xc++ -std=c++20 -fmodules
-//
-// RUN: mkdir -p %t/b2
-// RUN: mkdir -p %t/b1
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_d \
-// RUN: d.cppmap -o d.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_a \
-// RUN: -fmodule-file=d.pcm a.cppmap -o a.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_b2 \
-// RUN: -fmodule-file=a.pcm b2/b.cppmap -o b2/b.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_b1 \
-// RUN: -fmodule-file=b2/b.pcm b1/b.cppmap -o b1/b.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_f \
-// RUN: -fmodule-file=b1/b.pcm f.cppmap -o f.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module -fmodule-name=module_c \
-// RUN: -fmodule-file=f.pcm c.cppmap -o c.pcm
-// RUN: %clang_cc1 %{common-flags} -emit-module \
-// RUN: -fmodule-name=module_e e.cppmap -o e.pcm
-//
-// RUN: %clang_cc1 %{common-flags} \
-// RUN: -fmodule-file=c.pcm -fmodule-file=e.pcm \
-// RUN: src.cpp -o src.pic.o
-
-//--- invoke.h
-#ifndef _LIBCPP___TYPE_TRAITS_IS_SAME_H
-#define _LIBCPP___TYPE_TRAITS_IS_SAME_H
-namespace std { inline namespace _LIBCPP_ABI_NAMESPACE {
-template <class _Tp, class _Up>
-constexpr bool is_same_v = __is_same(_Tp, _Up);
-} }
-#endif
-
-//--- memory
-#include <invoke.h>
-namespace std { inline namespace _LIBCPP_ABI_NAMESPACE {
-template <class _Tp>
-using __decay_t = __decay(_Tp);
-template <class _Tp>
-using decay_t = __decay_t<_Tp>;
-} }
-
-//--- other.h
-#include <invoke.h>
-
-//--- a.cppmap
-module "module_a" {
-}
-
-//--- b1/b.cppmap
-module "module_b1" {
-}
-
-//--- b2/b.cppmap
-module "module_b2" {
-}
-
-//--- c.cppmap
-module "module_c" {
-}
-
-//--- d.cppmap
-module "module_d" {
- header "d.h"
-}
-
-//--- d.h
-#include <other.h>
-
-//--- e.cppmap
-module "module_e" {
- header "e.h"
-}
-
-//--- e.h
-#include <memory>
-
-//--- f.cppmap
-module "module_f" {
-}
-
-//--- src.cpp
-#include <d.h>
-#include <memory>
-template <typename T>
-concept coroutine_result =
- std::is_same_v<std::decay_t<T>, T>;
-template <coroutine_result R>
-class Co;
-using T = Co<void>;
diff --git a/clang/test/Modules/pr149404-02.cppm b/clang/test/Modules/pr149404-02.cppm
deleted file mode 100644
index 291619ea05b8a..0000000000000
--- a/clang/test/Modules/pr149404-02.cppm
+++ /dev/null
@@ -1,104 +0,0 @@
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -o %t/format.pcm %t/format.cppm
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -o %t/includes_in_gmf.pcm %t/includes_in_gmf.cppm
-// RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t %t/test.cpp -verify -fsyntax-only
-
-// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface -o %t/format.pcm %t/format.cppm
-// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface -o %t/includes_in_gmf.pcm %t/includes_in_gmf.cppm
-// RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t %t/test.cpp -verify -fsyntax-only
-
-//--- format.h
-#pragma once
-
-namespace test {
-
-template <class _Tp>
-struct type_identity {
- typedef _Tp type;
-};
-
-template <class _Tp>
-using type_identity_t = typename type_identity<_Tp>::type;
-
-
-template <class _Tp, class _CharT>
-struct formatter
-{
- formatter() = delete;
-};
-
-template <>
-struct formatter<char, char>
-{};
-
-template <class _CharT, class... _Args>
-struct basic_format_string {
- static inline const int __handles_{ [] {
- formatter<char, _CharT> f;
- (void)f;
- return 0;
- }() };
-
- consteval basic_format_string(const _CharT*) {
- (void)__handles_;
- }
-};
-
-template <class... _Args>
-using wformat_string = basic_format_string<wchar_t, type_identity_t<_Args>...>;
-
-template <class... _Args>
-using format_string = basic_format_string<char, type_identity_t<_Args>...>;
-
-template <class... _Args>
-void format(format_string<_Args...> __fmt, _Args&&... __args) {}
-
-template <class... _Args>
-void format(wformat_string<_Args...> __fmt, _Args&&... __args) {}
-
-}
-
-//--- format.cppm
-module;
-#include "format.h"
-export module format;
-
-export namespace test {
- using test::format;
- using test::formatter;
- using test::format_string;
-}
-
-auto something() -> void
-{
- auto a = 'a';
- test::format("{}", a);
-}
-
-//--- includes_in_gmf.cppm
-module;
-#include "format.h"
-export module includes_in_gmf;
-
-namespace test {
- using test::format;
- using test::formatter;
- using test::format_string;
-}
-
-//--- test.cpp
-// expected-no-diagnostics
-import format;
-import includes_in_gmf;
-
-auto what() -> void
-{
- auto a = 'a';
- test::format("{}", a);
-
- constexpr auto fs = "{}"; // test::format_string<char>{ "{}" }; // <- same result even passing exact param type
- test::format(fs, 'r');
-}
diff --git a/clang/test/Modules/pr172241.cppm b/clang/test/Modules/pr172241.cppm
deleted file mode 100644
index 3eb885e8b2d9f..0000000000000
--- a/clang/test/Modules/pr172241.cppm
+++ /dev/null
@@ -1,47 +0,0 @@
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-//
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/m.cppm -emit-module-interface -o %t/m.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/use.cpp -fmodule-file=m=%t/m.pcm -emit-llvm -o - | FileCheck %t/use.cpp
-//
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/m.cppm -emit-reduced-module-interface -o %t/m.pcm
-// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/use.cpp -fmodule-file=m=%t/m.pcm -emit-llvm -o - | FileCheck %t/use.cpp
-
-//--- header.h
-#pragma once
-
-template <unsigned T>
-class Templ {
-public:
- void lock() { __set_locked_bit(); }
-
-private:
- static constexpr auto __set_locked_bit = [](){};
-};
-
-class JT {
-public:
- ~JT() {
- Templ<4> state;
- state.lock();
- }
-};
-
-//--- m.cppm
-module;
-#include "header.h"
-export module m;
-export struct M {
- JT jt;
-};
-//--- use.cpp
-#include "header.h"
-import m;
-
-int main() {
- M m;
- return 0;
-}
-
-// CHECK: @_ZN5TemplILj4EE16__set_locked_bitE = {{.*}}linkonce_odr
diff --git a/clang/test/Modules/var-inst-def.cppm b/clang/test/Modules/var-inst-def.cppm
deleted file mode 100644
index 1414ec76c7be5..0000000000000
--- a/clang/test/Modules/var-inst-def.cppm
+++ /dev/null
@@ -1,110 +0,0 @@
-// RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: split-file %s %t
-// RUN: cd %t
-//
-// RUN: %clang_cc1 -fmodule-name=A -xc++ -emit-module -fmodules \
-// RUN: -fno-cxx-modules -fno-implicit-modules \
-// RUN: -fmodule-map-file-home-is-cwd -std=c++20 -I. a.modulemap -o a.pcm
-//
-// RUN: %clang_cc1 -fmodule-name=B -xc++ -emit-module -fmodules \
-// RUN: -fno-cxx-modules -fno-implicit-modules \
-// RUN: -fmodule-map-file-home-is-cwd -std=c++20 -I. b.modulemap -o b.pcm
-//
-// RUN: %clang_cc1 -fmodule-name=C -xc++ -emit-module -fmodules \
-// RUN: -fno-cxx-modules -fno-implicit-modules \
-// RUN: -fmodule-map-file-home-is-cwd -std=c++20 -I. c.modulemap -o c.pcm
-//
-// RUN: %clang_cc1 -fno-cxx-modules -fmodules -fno-implicit-modules \
-// RUN: -fmodule-map-file-home-is-cwd \
-// RUN: -fmodule-file=a.pcm -fmodule-file=b.pcm -fmodule-file=c.pcm \
-// RUN: -std=c++20 -I. main.cpp -o /dev/null
-
-//--- a.modulemap
-module "A" { header "a.h" }
-//--- b.modulemap
-module "B" { header "b.h" }
-//--- c.modulemap
-module "C" { header "c.h" }
-
-//--- common.h
-#pragma once
-#include "stl.h"
-
-//--- a.h
-#pragma once
-#include "common.h"
-#include "repro.h"
-
-//--- b.h
-#pragma once
-#include "common.h"
-#include "repro.h"
-
-//--- c.h
-#pragma once
-#include "common.h"
-#include "repro.h"
-
-//--- repro.h
-#pragma once
-#include "stl.h"
-
-namespace k {
-template <template <typename> class , typename >
-struct is_instantiation : std::integral_constant<bool, false> {};
-template <template <typename> class C, typename T>
-constexpr bool is_instantiation_v = is_instantiation<C, T>::value;
-}
-
-struct ThreadState;
-
-namespace cc::subtle {
-template <typename T>
-class U;
-}
-namespace cc {
-template <typename T> class Co;
-namespace internal {
-template <typename T>
-class Promise {
- static_assert(!k::is_instantiation_v<subtle::U, T>);
-};
-}
-}
-
-//--- stl.h
-#pragma once
-namespace std {
-inline namespace abi {
-template <class _Tp, _Tp __v>
-struct integral_constant {
- static const _Tp value = __v;
-};
-template <class _Tp, class _Up>
-constexpr bool is_same_v = __is_same(_Tp, _Up);
-template <class _Tp>
-using decay_t = __decay(_Tp);
-
-template <class>
-struct __invoke_result_impl ;
-template <class... _Args>
-using invoke_result_t = __invoke_result_impl<_Args...>;
-}
-}
-
-//--- main.cpp
-#include "stl.h"
-#include "a.h"
-
-namespace cc {
-template <typename F>
- requires k::is_instantiation_v<Co, std::invoke_result_t<F>>
-using result_type =
- std::invoke_result_t<F>;
-}
-namespace cc::internal {
-class final {
- Promise<ThreadState> outgoing_work_;
-};
-}
diff --git a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixSplatErrors.hlsl b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixSplatErrors.hlsl
index 0c2e53d382180..fc3f8e7adc050 100644
--- a/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixSplatErrors.hlsl
+++ b/clang/test/SemaHLSL/Types/BuiltinMatrix/MatrixSplatErrors.hlsl
@@ -1,8 +1,13 @@
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -std=hlsl202x -verify %s
-void SplatOfVectortoMat(int4 V){
+void SplatOfUndersizedVectortoMat(int3 V){
int2x2 M = V;
- // expected-error at -1 {{cannot initialize a variable of type 'int2x2' (aka 'matrix<int, 2, 2>') with an lvalue of type 'int4' (aka 'vector<int, 4>')}}
+ // expected-error at -1 {{too few initializers in list for type 'int2x2' (aka 'matrix<int, 2, 2>') (expected 4 but found 3)}}
+}
+
+void SplatOfOversizedVectortoMat(int3 V){
+ int1x2 M = V;
+ // expected-error at -1 {{too many initializers in list for type 'int1x2' (aka 'matrix<int, 1, 2>') (expected 2 but found 3)}}
}
void SplatOfMattoMat(int4x3 N){
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index 490136961ebc6..e6cabcc7eb530 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -55,6 +55,9 @@
#include <optional>
#include <set>
#include <system_error>
+#if LLVM_ON_UNIX
+#include <signal.h>
+#endif
using namespace clang;
using namespace clang::driver;
@@ -378,7 +381,7 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
if (!UseNewCC1Process) {
TheDriver.CC1Main = ExecuteCC1WithContext;
// Ensure the CC1Command actually catches cc1 crashes
- llvm::CrashRecoveryContext::Enable();
+ llvm::CrashRecoveryContext::Enable(true);
}
std::unique_ptr<Compilation> C(TheDriver.BuildCompilation(Args));
@@ -407,6 +410,7 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
Driver::CommandStatus CommandStatus = Driver::CommandStatus::Ok;
// Pretend the first command failed if ReproStatus is Always.
const Command *FailingCommand = nullptr;
+ int CommandRes = 0;
if (!C->getJobs().empty())
FailingCommand = &*C->getJobs().begin();
if (C && !C->containsError()) {
@@ -414,7 +418,7 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
Res = TheDriver.ExecuteCompilation(*C, FailingCommands);
for (const auto &P : FailingCommands) {
- int CommandRes = P.first;
+ CommandRes = P.first;
FailingCommand = P.second;
if (!Res)
Res = CommandRes;
@@ -471,6 +475,18 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
Res = 1;
#endif
+#if LLVM_ON_UNIX
+ // On Unix, signals are represented by return codes of 128 plus the signal
+ // number. If the return code indicates it was from a signal handler, raise
+ // the signal so that the exit code includes the signal number, as required
+ // by POSIX. Return code 255 is excluded because some tools, such as
+ // llvm-ifs, exit with code 255 (-1) on failure.
+ if (CommandRes > 128 && CommandRes != 255) {
+ llvm::sys::unregisterHandlers();
+ raise(CommandRes - 128);
+ }
+#endif
+
// If we have multiple failing commands, we return the result of the first
// failing command.
return Res;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_unwind_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_unwind_win.cpp
index 30ba812afc4b0..f1b2a157c3538 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_unwind_win.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_unwind_win.cpp
@@ -43,10 +43,47 @@ void BufferedStackTrace::UnwindSlow(uptr pc, u32 max_depth) {
trace_buffer[0] = pc;
}
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wframe-larger-than="
-#endif
+PVOID CALLBACK FallbackFunctionTableAccess(HANDLE hProcess,
+ DWORD64 dwAddrBase) {
+ // First try DbgHelp's function.
+ if (PVOID pResult =
+ __sanitizer::SymFunctionTableAccess64(hProcess, dwAddrBase)) {
+ return pResult;
+ }
+
+ // Fall back to RtlLookupFunctionEntry for dynamic code.
+ // Functions registered with RtlAddFunctionTable are not necessarily
+ // registered with DbgHelp, so this is required to cover some edge cases
+ // (e.g. JIT compilers can use Rtl* functions).
+# if SANITIZER_WINDOWS64
+ DWORD64 dw64ImageBase = 0;
+ return RtlLookupFunctionEntry(dwAddrBase, &dw64ImageBase, nullptr);
+# else
+ return nullptr;
+# endif
+}
+
+DWORD64 CALLBACK FallbackGetModuleBase(HANDLE hProcess, DWORD64 dwAddr) {
+ if (DWORD64 dwResult = __sanitizer::SymGetModuleBase64(hProcess, dwAddr)) {
+ return dwResult;
+ }
+
+ // Both GetModuleBase and FunctionTableAccess must provide this fallback,
+ // otherwise dynamic functions won't be properly unwound.
+# if SANITIZER_WINDOWS64
+ DWORD64 dw64ImageBase = 0;
+ if (RtlLookupFunctionEntry(dwAddr, &dw64ImageBase, nullptr)) {
+ return dw64ImageBase;
+ }
+# endif
+
+ return 0;
+}
+
+# ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wframe-larger-than="
+# endif
void BufferedStackTrace::UnwindSlow(uptr pc, void *context, u32 max_depth) {
CHECK(context);
CHECK_GE(max_depth, 2);
@@ -91,8 +128,8 @@ void BufferedStackTrace::UnwindSlow(uptr pc, void *context, u32 max_depth) {
stack_frame.AddrFrame.Mode = AddrModeFlat;
stack_frame.AddrStack.Mode = AddrModeFlat;
while (StackWalk64(machine_type, GetCurrentProcess(), GetCurrentThread(),
- &stack_frame, &ctx, NULL, SymFunctionTableAccess64,
- SymGetModuleBase64, NULL) &&
+ &stack_frame, &ctx, NULL, FallbackFunctionTableAccess,
+ FallbackGetModuleBase, NULL) &&
size < Min(max_depth, kStackTraceMax)) {
trace_buffer[size++] = (uptr)stack_frame.AddrPC.Offset;
}
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 37297dd0ad6d1..e75bddc7e1bef 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -456,13 +456,18 @@ mlir::Value inlineElementalOp(
/// over the optimal extents deduced from both shapes. If \p emitWorkshareLoop
/// is true, a workshare loop construct may be emitted when available.
/// Allocatable LHS must be allocated with the right shape and parameters.
+/// An optional \p scalarCombineAndAssign callback can be supplied to implement
+/// more complex assignment actions, such as reductions that may need to happen
+/// atomically. When provided, the callback is passed the scalar addresses of
+/// the LHS and RHS elements and is in charge of generating the combination and
+/// assignment logic.
void genNoAliasArrayAssignment(
mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
hlfir::Entity lhs, bool emitWorkshareLoop = false,
bool temporaryLHS = false,
- std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
- hlfir::Entity, hlfir::Entity)> *combiner =
- nullptr,
+ std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+ hlfir::Entity, mlir::ArrayAttr)>
+ *scalarCombineAndAssign = nullptr,
mlir::ArrayAttr accessGroups = {});
/// Generate an assignment from \p rhs to \p lhs when they are known not to
@@ -474,19 +479,19 @@ void genNoAliasAssignment(
mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
hlfir::Entity lhs, bool emitWorkshareLoop = false,
bool temporaryLHS = false,
- std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
- hlfir::Entity, hlfir::Entity)> *combiner =
- nullptr,
+ std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+ hlfir::Entity, mlir::ArrayAttr accessGroups)>
+ *scalarCombineAndAssign = nullptr,
mlir::ArrayAttr accessGroups = {});
inline void genNoAliasAssignment(
mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
- std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
- hlfir::Entity, hlfir::Entity)>
- combiner,
+ std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+ hlfir::Entity, mlir::ArrayAttr)>
+ scalarCombineAndAssign,
mlir::ArrayAttr accessGroups = {}) {
genNoAliasAssignment(loc, builder, rhs, lhs, emitWorkshareLoop, temporaryLHS,
- &combiner, accessGroups);
+ &scalarCombineAndAssign, accessGroups);
}
/// Create a new temporary with the shape and parameters of the provided
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 6eedb089eac40..687c2f0f4a42a 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2540,7 +2540,6 @@ class FirConverter : public Fortran::lower::AbstractConverter {
// PFT branch analysis), allowing the loop to exit only when the condition
// becomes false.
if (!unstructuredContext) {
- maybeStartBlock(preheaderBlock); // no block or empty block
genDoWhileAsSCFWhile(*whileCondition, eval, doStmtEval);
return;
}
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 3355bf1475e30..e7a286b73bb4b 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -1395,20 +1395,17 @@ bool hlfir::elementalOpMustProduceTemp(hlfir::ElementalOp elemental) {
static void combineAndStoreElement(
mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity lhs,
hlfir::Entity rhs, bool temporaryLHS,
- std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
- hlfir::Entity, hlfir::Entity)> *combiner,
+ std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+ hlfir::Entity, mlir::ArrayAttr)> *scalarCombineAndAssign,
mlir::ArrayAttr accessGroups) {
+ if (scalarCombineAndAssign) {
+ (*scalarCombineAndAssign)(loc, builder, lhs, rhs, accessGroups);
+ return;
+ }
hlfir::Entity valueToAssign = hlfir::loadTrivialScalar(loc, builder, rhs);
if (accessGroups)
if (auto load = valueToAssign.getDefiningOp<fir::LoadOp>())
load.setAccessGroupsAttr(accessGroups);
- if (combiner) {
- hlfir::Entity lhsValue = hlfir::loadTrivialScalar(loc, builder, lhs);
- if (accessGroups)
- if (auto load = lhsValue.getDefiningOp<fir::LoadOp>())
- load.setAccessGroupsAttr(accessGroups);
- valueToAssign = (*combiner)(loc, builder, lhsValue, valueToAssign);
- }
auto assign = hlfir::AssignOp::create(builder, loc, valueToAssign, lhs,
/*realloc=*/false,
/*keep_lhs_length_if_realloc=*/false,
@@ -1420,8 +1417,8 @@ static void combineAndStoreElement(
void hlfir::genNoAliasArrayAssignment(
mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
- std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
- hlfir::Entity, hlfir::Entity)> *combiner,
+ std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+ hlfir::Entity, mlir::ArrayAttr)> *scalarCombineAndAssign,
mlir::ArrayAttr accessGroups) {
mlir::OpBuilder::InsertionGuard guard(builder);
rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
@@ -1441,28 +1438,30 @@ void hlfir::genNoAliasArrayAssignment(
builder.setInsertionPointToStart(loopNest.body);
auto rhsArrayElement =
hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
- rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
+ if (!scalarCombineAndAssign)
+ rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
auto lhsArrayElement =
hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
combineAndStoreElement(loc, builder, lhsArrayElement, rhsArrayElement,
- temporaryLHS, combiner, accessGroups);
+ temporaryLHS, scalarCombineAndAssign, accessGroups);
}
void hlfir::genNoAliasAssignment(
mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
- std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
- hlfir::Entity, hlfir::Entity)> *combiner,
+ std::function<void(mlir::Location, fir::FirOpBuilder &, hlfir::Entity,
+ hlfir::Entity, mlir::ArrayAttr)> *scalarCombineAndAssign,
mlir::ArrayAttr accessGroups) {
if (lhs.isArray()) {
genNoAliasArrayAssignment(loc, builder, rhs, lhs, emitWorkshareLoop,
- temporaryLHS, combiner, accessGroups);
+ temporaryLHS, scalarCombineAndAssign,
+ accessGroups);
return;
}
rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
- combineAndStoreElement(loc, builder, lhs, rhs, temporaryLHS, combiner,
- accessGroups);
+ combineAndStoreElement(loc, builder, lhs, rhs, temporaryLHS,
+ scalarCombineAndAssign, accessGroups);
}
std::pair<hlfir::Entity, bool>
diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
index 9ced235f05707..d8ed9ce968e0a 100644
--- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
+++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp
@@ -31,6 +31,19 @@
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/CommandLine.h"
+
+static llvm::cl::opt<bool> useAccReductionCombine(
+ "openacc-use-reduction-combine",
+ llvm::cl::desc("Whether to generate acc.reduction_combine. Does not "
+ "control reduction for MIN/MAX and logical reductions."),
+ llvm::cl::init(false));
+
+static llvm::cl::opt<bool> useAccReductionCombineAll(
+ "openacc-use-reduction-combine-all",
+ llvm::cl::desc("Whether to generate acc.reduction_combine for all types "
+ "and operators"),
+ llvm::cl::init(false));
namespace fir::acc {
@@ -1045,6 +1058,25 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder,
TODO(loc, "reduction operator");
}
+static bool useAccReductionCombineOp(mlir::Type elementType,
+ mlir::acc::ReductionOperator op) {
+ if (useAccReductionCombineAll)
+ return true;
+ if (!useAccReductionCombine)
+ return false;
+ // LOGICAL operators do not have mlir operators and require FIR-specific
+ // logic to interpret the TRUE and FALSE values from the storage (implemented
+ // in fir.convert to i1).
+ if (!llvm::isa<mlir::IntegerType, mlir::FloatType, mlir::ComplexType>(
+ elementType))
+ return false;
+ // MIN/MAX for floating point can have different edge-case behaviors (NANs).
+ // Currently the mlir operator does not match the behavior implemented by
+ // flang.
+ return op != mlir::acc::ReductionOperator::AccMax &&
+ op != mlir::acc::ReductionOperator::AccMin;
+}
+
template <typename Ty>
bool OpenACCMappableModel<Ty>::generateCombiner(
mlir::Type type, mlir::OpBuilder &mlirBuilder, mlir::Location loc,
@@ -1069,11 +1101,25 @@ bool OpenACCMappableModel<Ty>::generateCombiner(
}
mlir::Type elementType = fir::getFortranElementType(dest.getType());
- auto genKernel = [&](mlir::Location l, fir::FirOpBuilder &b,
- hlfir::Entity srcElementValue,
- hlfir::Entity destElementValue) -> hlfir::Entity {
- return hlfir::Entity{genScalarCombiner(builder, loc, op, elementType,
- srcElementValue, destElementValue)};
+ auto genKernel =
+ [&](mlir::Location l, fir::FirOpBuilder &b, hlfir::Entity destElementAddr,
+ hlfir::Entity srcElementAddr, mlir::ArrayAttr accessGroups) -> void {
+ assert(!accessGroups && "access groups not expected in acc reductions");
+ if (useAccReductionCombineOp(elementType, op)) {
+ mlir::acc::ReductionCombineOp::create(builder, loc, destElementAddr,
+ srcElementAddr, op);
+ return;
+ }
+ hlfir::Entity srcElementValue =
+ hlfir::loadTrivialScalar(loc, builder, srcElementAddr);
+ hlfir::Entity destElementValue =
+ hlfir::loadTrivialScalar(loc, builder, destElementAddr);
+ hlfir::Entity combined(genScalarCombiner(
+ builder, loc, op, elementType, destElementValue, srcElementValue));
+ hlfir::AssignOp::create(builder, loc, combined, destElementAddr,
+ /*realloc=*/false,
+ /*keep_lhs_length_if_realloc=*/false,
+ /*temporary_lhs=*/false);
};
hlfir::genNoAliasAssignment(loc, builder, srcSection, destSection,
/*emitWorkshareLoop=*/false,
diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90
index 339a4e3435c0d..2c79cacada050 100644
--- a/flang/test/Lower/OpenACC/acc-reduction.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction.f90
@@ -1,6 +1,7 @@
! This test checks lowering of OpenACC reduction clause.
! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -fopenacc -emit-hlfir %s -o - -openacc-use-reduction-combine | FileCheck -check-prefix=ACC_COMBINE %s
! CHECK-LABEL: acc.reduction.recipe @reduction_lor_ref_box_heap_l32 : !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>> reduction_operator <lor> init {
! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>>):
@@ -97,7 +98,6 @@
! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_5]]#0, %[[CONSTANT_7]] : index
! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]], %[[ADDI_1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[CONSTANT_8:.*]] = arith.constant 0 : index
! CHECK: %[[BOX_DIMS_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_8]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
! CHECK: %[[CONSTANT_9:.*]] = arith.constant 1 : index
@@ -108,6 +108,7 @@
! CHECK: %[[SUBI_3:.*]] = arith.subi %[[BOX_DIMS_7]]#0, %[[CONSTANT_10]] : index
! CHECK: %[[ADDI_3:.*]] = arith.addi %[[VAL_2]], %[[SUBI_3]] : index
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_2]], %[[ADDI_3]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
@@ -170,13 +171,13 @@
! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32
! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32
@@ -239,13 +240,13 @@
! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32
! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32
@@ -321,8 +322,8 @@
! CHECK: %[[CONSTANT_9:.*]] = arith.constant 1 : index
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_9]] to %[[CONSTANT_5]] step %[[CONSTANT_9]] unordered {
! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
! CHECK: %[[ADDI_4:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: hlfir.assign %[[ADDI_4]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
@@ -371,13 +372,13 @@
! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
@@ -434,13 +435,13 @@
! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index
! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index
! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
! CHECK: %[[ADDI_2:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: hlfir.assign %[[ADDI_2]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -524,8 +525,8 @@
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_17]] to %[[CONSTANT_9]] step %[[CONSTANT_17]] unordered {
! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_17]] to %[[CONSTANT_6]] step %[[CONSTANT_17]] unordered {
! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
@@ -584,8 +585,8 @@
! CHECK: %[[CONSTANT_10:.*]] = arith.constant 1 : index
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_10]] to %[[CONSTANT_5]] step %[[CONSTANT_10]] unordered {
! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32>
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32>
@@ -847,8 +848,8 @@
! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
@@ -906,8 +907,8 @@
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
@@ -966,8 +967,8 @@
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
! CHECK: %[[CMPF_0:.*]] = arith.cmpf olt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32
@@ -1019,8 +1020,8 @@
! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
@@ -1071,8 +1072,8 @@
! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
! CHECK: %[[MULF_0:.*]] = arith.mulf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
! CHECK: hlfir.assign %[[MULF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
@@ -1121,8 +1122,8 @@
! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
! CHECK: %[[MULI_0:.*]] = arith.muli %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: hlfir.assign %[[MULI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -1171,8 +1172,8 @@
! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32>
! CHECK: %[[ADDF_0:.*]] = arith.addf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32
! CHECK: hlfir.assign %[[ADDF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32>
@@ -1235,8 +1236,8 @@
! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_1]] step %[[CONSTANT_6]] unordered {
! CHECK: fir.do_loop %[[VAL_4:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_0]] step %[[CONSTANT_6]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -1278,8 +1279,8 @@
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered {
! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -1313,8 +1314,8 @@
! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered {
! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32>
! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32>
! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32
! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32>
@@ -1884,3 +1885,111 @@ subroutine acc_reduction_logical_allocatable(l)
! CHECK-LABEL: func.func @_QPacc_reduction_logical_allocatable(
! CHECK: %[[REDUCTION_0:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>>) recipe(@reduction_lor_ref_box_heap_l32) -> !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>> {name = "l"}
! CHECK: acc.parallel reduction(%[[REDUCTION_0]] : !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>>)
+
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_lor_ref_box_heap_l32 : !fir.ref<!fir.box<!fir.heap<!fir.logical<4>>>> reduction_operator <lor> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_max_box_UxUxf32 : !fir.box<!fir.array<?x?xf32>> reduction_operator <max> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_max_ref_box_ptr_Uxf32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> reduction_operator <max> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_max_ref_box_heap_Uxf32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> reduction_operator <max> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
+! ACC_COMBINE-LABEL: } combiner {
+! ACC_COMBINE: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>):
+! ACC_COMBINE: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+! ACC_COMBINE: %[[CONSTANT_1:.*]] = arith.constant 1 : index
+! ACC_COMBINE: %[[CONSTANT_2:.*]] = arith.constant 3 : index
+! ACC_COMBINE: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+! ACC_COMBINE: %[[CONSTANT_4:.*]] = arith.constant 2 : index
+! ACC_COMBINE: %[[CONSTANT_5:.*]] = arith.constant 3 : index
+! ACC_COMBINE: %[[CONSTANT_6:.*]] = arith.constant true
+! ACC_COMBINE: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_5]] : (index) -> !fir.shape<1>
+! ACC_COMBINE: %[[CONSTANT_7:.*]] = arith.constant 0 : index
+! ACC_COMBINE: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_7]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! ACC_COMBINE: %[[ADDI_0:.*]] = arith.addi %[[BOX_DIMS_0]]#0, %[[CONSTANT_1]] : index
+! ACC_COMBINE: %[[ADDI_1:.*]] = arith.addi %[[BOX_DIMS_0]]#0, %[[CONSTANT_2]] : index
+! ACC_COMBINE: %[[CONSTANT_8:.*]] = arith.constant 0 : index
+! ACC_COMBINE: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_8]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! ACC_COMBINE: %[[ADDI_2:.*]] = arith.addi %[[BOX_DIMS_1]]#0, %[[CONSTANT_1]] : index
+! ACC_COMBINE: %[[ADDI_3:.*]] = arith.addi %[[BOX_DIMS_1]]#0, %[[CONSTANT_2]] : index
+! ACC_COMBINE: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_2]]:%[[ADDI_3]]:%[[CONSTANT_0]]) shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! ACC_COMBINE: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_0]]:%[[ADDI_1]]:%[[CONSTANT_0]]) shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! ACC_COMBINE: %[[CONSTANT_9:.*]] = arith.constant 1 : index
+! ACC_COMBINE: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_9]] to %[[CONSTANT_5]] step %[[CONSTANT_9]] unordered {
+! ACC_COMBINE: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! ACC_COMBINE: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! ACC_COMBINE: acc.reduction_combine %[[DESIGNATE_2]] into %[[DESIGNATE_3]] <add> : !fir.ref<i32>
+! ACC_COMBINE: }
+! ACC_COMBINE: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xi32>>
+! ACC_COMBINE: }
+
+
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_section_lb0.ub9xlb0.ub19_ref_10x20xi32 : !fir.ref<!fir.array<10x20xi32>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_section_lb10.ub19_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ptr<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.heap<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_mul_ref_z32 : !fir.ref<complex<f32>> reduction_operator <mul> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<complex<f32>>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_z32 : !fir.ref<complex<f32>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<complex<f32>>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_neqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <neqv> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_eqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <eqv> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_lor_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <lor> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_land_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <land> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_xor_ref_i32 : !fir.ref<i32> reduction_operator <xor> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <xor> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_ior_ref_i32 : !fir.ref<i32> reduction_operator <ior> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <ior> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_iand_ref_i32 : !fir.ref<i32> reduction_operator <iand> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <iand> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_max_ref_f32 : !fir.ref<f32> reduction_operator <max> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_max_ref_i32 : !fir.ref<i32> reduction_operator <max> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_min_ref_f32 : !fir.ref<f32> reduction_operator <min> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_min_ref_i32 : !fir.ref<i32> reduction_operator <min> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_mul_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <mul> init {
+! ACC_COMBINE-NOT: acc.reduction_combine
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<f32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_mul_ref_f32 : !fir.ref<f32> reduction_operator <mul> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<f32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_mul_ref_i32 : !fir.ref<i32> reduction_operator <mul> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <mul> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<f32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_f32 : !fir.ref<f32> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<f32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
+! ACC_COMBINE-LABEL: acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init {
+! ACC_COMBINE: acc.reduction_combine %{{.*}} into %{{.*}} <add> : !fir.ref<i32>
diff --git a/flang/test/Lower/do-while-to-scf-while.f90 b/flang/test/Lower/do-while-to-scf-while.f90
index d2f38d6e09694..6d057ed823c36 100644
--- a/flang/test/Lower/do-while-to-scf-while.f90
+++ b/flang/test/Lower/do-while-to-scf-while.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -lower-do-while-to-scf-while %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -lower-do-while-to-scf-while %s -o - | FileCheck %s
! CHECK-LABEL: func.func @_QPsimple_do_while()
! CHECK: scf.while
@@ -85,3 +85,20 @@ subroutine do_while_goto_internal_backedge()
print *, "sum=", sum
end subroutine do_while_goto_internal_backedge
+! CHECK-LABEL: func.func @_QPtest_after_unstructured(
+! CHECK: scf.while
+! CHECK-NOT: cf.br
+! CHECK: return
+subroutine test_after_unstructured(cdt, switch)
+ logical :: cdt, eval
+ integer :: switch, i = 1
+ if (cdt) then
+ select case (switch)
+ case (0)
+ call print1()
+ end select
+ end if
+ do while(eval(i))
+ call incr(i)
+ end do
+end subroutine
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 828980a3500df..841ad0a34b2f1 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -34,7 +34,13 @@
#include "math/bf16addf.h"
#include "math/bf16addf128.h"
#include "math/bf16divf.h"
+#include "math/bf16divl.h"
#include "math/bf16fmaf.h"
+#include "math/bf16fmal.h"
+#include "math/bf16mul.h"
+#include "math/bf16mulf.h"
+#include "math/bf16mulf128.h"
+#include "math/bf16mull.h"
#include "math/canonicalize.h"
#include "math/canonicalizebf16.h"
#include "math/canonicalizef.h"
@@ -70,6 +76,10 @@
#include "math/expm1.h"
#include "math/expm1f.h"
#include "math/expm1f16.h"
+#include "math/f16add.h"
+#include "math/f16addf.h"
+#include "math/f16addf128.h"
+#include "math/f16addl.h"
#include "math/f16fma.h"
#include "math/f16fmaf.h"
#include "math/f16fmaf128.h"
@@ -113,6 +123,7 @@
#include "math/logbf.h"
#include "math/logbf128.h"
#include "math/logbf16.h"
+#include "math/logbl.h"
#include "math/logf.h"
#include "math/logf16.h"
#include "math/pow.h"
@@ -134,7 +145,9 @@
#include "math/sqrtf16.h"
#include "math/tan.h"
#include "math/tanf.h"
+#include "math/tanf16.h"
#include "math/tanhf.h"
#include "math/tanhf16.h"
+#include "math/tanpif.h"
#endif // LLVM_LIBC_SHARED_MATH_H
diff --git a/libc/shared/math/bf16divl.h b/libc/shared/math/bf16divl.h
new file mode 100644
index 0000000000000..f30cfaa012c4f
--- /dev/null
+++ b/libc/shared/math/bf16divl.h
@@ -0,0 +1,23 @@
+//===-- Shared bf16divl function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16DIVL_H
+#define LLVM_LIBC_SHARED_MATH_BF16DIVL_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/bf16divl.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16divl;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16DIVL_H
diff --git a/libc/shared/math/bf16fmal.h b/libc/shared/math/bf16fmal.h
new file mode 100644
index 0000000000000..24aacc53c72a8
--- /dev/null
+++ b/libc/shared/math/bf16fmal.h
@@ -0,0 +1,25 @@
+//===-- Shared bf16fmal function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16FMAL_H
+#define LLVM_LIBC_SHARED_MATH_BF16FMAL_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/bf16fmal.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace shared {
+
+using math::bf16fmal;
+
+} // namespace shared
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16FMAL_H
diff --git a/libc/shared/math/bf16mul.h b/libc/shared/math/bf16mul.h
new file mode 100644
index 0000000000000..064416c498f59
--- /dev/null
+++ b/libc/shared/math/bf16mul.h
@@ -0,0 +1,22 @@
+//===-- Shared bf16mul function ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16MUL_H
+#define LLVM_LIBC_SHARED_MATH_BF16MUL_H
+
+#include "src/__support/math/bf16mul.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16mul;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16MUL_H
diff --git a/libc/shared/math/bf16mulf.h b/libc/shared/math/bf16mulf.h
new file mode 100644
index 0000000000000..456bf85bfadf4
--- /dev/null
+++ b/libc/shared/math/bf16mulf.h
@@ -0,0 +1,22 @@
+//===-- Shared bf16mulf function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16MULF_H
+#define LLVM_LIBC_SHARED_MATH_BF16MULF_H
+
+#include "src/__support/math/bf16mulf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16mulf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16MULF_H
diff --git a/libc/shared/math/bf16mulf128.h b/libc/shared/math/bf16mulf128.h
new file mode 100644
index 0000000000000..41baf47dcd78d
--- /dev/null
+++ b/libc/shared/math/bf16mulf128.h
@@ -0,0 +1,28 @@
+//===-- Shared bf16mulf128 function -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16MULF128_H
+#define LLVM_LIBC_SHARED_MATH_BF16MULF128_H
+
+#include "include/llvm-libc-types/float128.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT128
+
+#include "src/__support/math/bf16mulf128.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16mulf128;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT128
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16MULF128_H
diff --git a/libc/shared/math/bf16mull.h b/libc/shared/math/bf16mull.h
new file mode 100644
index 0000000000000..fdea2182279b7
--- /dev/null
+++ b/libc/shared/math/bf16mull.h
@@ -0,0 +1,22 @@
+//===-- Shared bf16mull function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16MULL_H
+#define LLVM_LIBC_SHARED_MATH_BF16MULL_H
+
+#include "src/__support/math/bf16mull.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16mull;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16MULL_H
diff --git a/libc/shared/math/f16add.h b/libc/shared/math/f16add.h
new file mode 100644
index 0000000000000..4a51de05d0857
--- /dev/null
+++ b/libc/shared/math/f16add.h
@@ -0,0 +1,29 @@
+//===-- Shared f16add function ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_F16ADD_H
+#define LLVM_LIBC_SHARED_MATH_F16ADD_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/f16add.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::f16add;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_F16ADD_H
diff --git a/libc/shared/math/f16addf.h b/libc/shared/math/f16addf.h
new file mode 100644
index 0000000000000..346b584cb826d
--- /dev/null
+++ b/libc/shared/math/f16addf.h
@@ -0,0 +1,29 @@
+//===-- Shared f16addf function ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_F16ADDF_H
+#define LLVM_LIBC_SHARED_MATH_F16ADDF_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/f16addf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::f16addf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_F16ADDF_H
diff --git a/libc/shared/math/f16addf128.h b/libc/shared/math/f16addf128.h
new file mode 100644
index 0000000000000..40321695a6342
--- /dev/null
+++ b/libc/shared/math/f16addf128.h
@@ -0,0 +1,32 @@
+//===-- Shared f16addf128 function ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_F16ADDF128_H
+#define LLVM_LIBC_SHARED_MATH_F16ADDF128_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+#include "include/llvm-libc-types/float128.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+#ifdef LIBC_TYPES_HAS_FLOAT128
+
+#include "shared/libc_common.h"
+#include "src/__support/math/f16addf128.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::f16addf128;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT128
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_F16ADDF128_H
diff --git a/libc/shared/math/f16addl.h b/libc/shared/math/f16addl.h
new file mode 100644
index 0000000000000..3406b0e65313a
--- /dev/null
+++ b/libc/shared/math/f16addl.h
@@ -0,0 +1,29 @@
+//===-- Shared f16addl function ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_F16ADDL_H
+#define LLVM_LIBC_SHARED_MATH_F16ADDL_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/f16addl.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::f16addl;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_F16ADDL_H
diff --git a/libc/shared/math/logbl.h b/libc/shared/math/logbl.h
new file mode 100644
index 0000000000000..d2bee4afe4e76
--- /dev/null
+++ b/libc/shared/math/logbl.h
@@ -0,0 +1,23 @@
+//===-- Shared logbl function -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_LOGBL_H
+#define LLVM_LIBC_SHARED_MATH_LOGBL_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/logbl.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::logbl;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_LOGBL_H
diff --git a/libc/shared/math/tanf16.h b/libc/shared/math/tanf16.h
new file mode 100644
index 0000000000000..b8ca2b87335e5
--- /dev/null
+++ b/libc/shared/math/tanf16.h
@@ -0,0 +1,29 @@
+//===-- Shared tanf16 function ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_TANF16_H
+#define LLVM_LIBC_SHARED_MATH_TANF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/tanf16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::tanf16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_TANF16_H
diff --git a/libc/shared/math/tanpif.h b/libc/shared/math/tanpif.h
new file mode 100644
index 0000000000000..4c1f691ddb1d2
--- /dev/null
+++ b/libc/shared/math/tanpif.h
@@ -0,0 +1,23 @@
+//===-- Shared tanpif function ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_TANPIF_H
+#define LLVM_LIBC_SHARED_MATH_TANPIF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/tanpif.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::tanpif;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_TANPIF_H
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index af2c66597b75a..98e2721b73a65 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -381,6 +381,43 @@ add_header_library(
libc.src.__support.FPUtil.fma
libc.src.__support.macros.config
)
+add_header_library(
+ bf16mul
+ HDRS
+ bf16mul.h
+ DEPENDS
+ libc.src.__support.FPUtil.bfloat16
+ libc.src.__support.FPUtil.generic.mul
+ libc.src.__support.macros.config
+)
+add_header_library(
+ bf16mulf
+ HDRS
+ bf16mulf.h
+ DEPENDS
+ libc.src.__support.FPUtil.bfloat16
+ libc.src.__support.FPUtil.generic.mul
+ libc.src.__support.macros.config
+)
+add_header_library(
+ bf16mulf128
+ HDRS
+ bf16mulf128.h
+ DEPENDS
+ libc.include.llvm-libc-types.float128
+ libc.src.__support.FPUtil.bfloat16
+ libc.src.__support.FPUtil.generic.mul
+ libc.src.__support.macros.config
+)
+add_header_library(
+ bf16mull
+ HDRS
+ bf16mull.h
+ DEPENDS
+ libc.src.__support.FPUtil.bfloat16
+ libc.src.__support.FPUtil.generic.mul
+ libc.src.__support.macros.config
+)
add_header_library(
canonicalize
@@ -440,6 +477,16 @@ add_header_library(
)
+add_header_library(
+ bf16divl
+ HDRS
+ bf16divl.h
+ DEPENDS
+ libc.src.__support.FPUtil.bfloat16
+ libc.src.__support.FPUtil.generic.div
+ libc.src.__support.macros.config
+)
+
add_header_library(
cbrt
HDRS
@@ -701,6 +748,47 @@ add_header_library(
libc.src.__support.math.exp10_float16_constants
)
+add_header_library(
+ f16add
+ HDRS
+ f16add.h
+ DEPENDS
+ libc.include.llvm-libc-macros.float16_macros
+ libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.macros.config
+)
+
+add_header_library(
+ f16addf
+ HDRS
+ f16addf.h
+ DEPENDS
+ libc.include.llvm-libc-macros.float16_macros
+ libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.macros.config
+)
+
+add_header_library(
+ f16addf128
+ HDRS
+ f16addf128.h
+ DEPENDS
+ libc.include.llvm-libc-macros.float16_macros
+ libc.include.llvm-libc-types.float128
+ libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.macros.config
+)
+
+add_header_library(
+ f16addl
+ HDRS
+ f16addl.h
+ DEPENDS
+ libc.include.llvm-libc-macros.float16_macros
+ libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.macros.config
+)
+
add_header_library(
ffmal
HDRS
@@ -845,6 +933,16 @@ add_header_library(
libc.include.llvm-libc-macros.float16_macros
)
+add_header_library(
+ bf16fmal
+ HDRS
+ bf16fmal.h
+ DEPENDS
+ libc.src.__support.macros.config
+ libc.src.__support.FPUtil.fma
+ libc.src.__support.FPUtil.bfloat16
+)
+
add_header_library(
ilogb
HDRS
@@ -1675,6 +1773,16 @@ add_header_library(
libc.src.__support.macros.properties.cpu_features
)
+add_header_library(
+ logbl
+ HDRS
+ logbl.h
+ DEPENDS
+ libc.src.__support.FPUtil.manipulation_functions
+ libc.src.__support.common
+ libc.src.__support.macros.config
+)
+
add_header_library(
log_range_reduction
HDRS
@@ -1888,6 +1996,23 @@ add_header_library(
libc.src.__support.macros.optimization
)
+add_header_library(
+ tanf16
+ HDRS
+ tanf16.h
+ DEPENDS
+ .sincosf16_utils
+ libc.hdr.errno_macros
+ libc.hdr.fenv_macros
+ libc.src.__support.FPUtil.cast
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.except_value_utils
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.macros.optimization
+ libc.include.llvm-libc-macros.float16_macros
+)
+
add_header_library(
tanhf
HDRS
@@ -1924,3 +2049,19 @@ add_header_library(
libc.src.__support.macros.optimization
libc.include.llvm-libc-macros.float16_macros
)
+
+add_header_library(
+ tanpif
+ HDRS
+ tanpif.h
+ DEPENDS
+ .sincosf16_utils
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.cast
+ libc.src.__support.FPUtil.except_value_utils
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.optimization
+)
diff --git a/libc/src/__support/math/bf16divl.h b/libc/src/__support/math/bf16divl.h
new file mode 100644
index 0000000000000..ec5a9244b98d6
--- /dev/null
+++ b/libc/src/__support/math/bf16divl.h
@@ -0,0 +1,26 @@
+//===-- Implementation header for bf16divl ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16DIVL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16DIVL_H
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/div.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE bfloat16 bf16divl(long double x, long double y) {
+ return fputil::generic::div<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16DIVL_H
diff --git a/libc/src/__support/math/bf16fmal.h b/libc/src/__support/math/bf16fmal.h
new file mode 100644
index 0000000000000..93a04d0ec8fac
--- /dev/null
+++ b/libc/src/__support/math/bf16fmal.h
@@ -0,0 +1,26 @@
+//===-- Implementation header for bf16fmal ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMAL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMAL_H
+
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE bfloat16 bf16fmal(long double x, long double y, long double z) {
+ return fputil::fma<bfloat16>(x, y, z);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMAL_H
diff --git a/libc/src/__support/math/bf16mul.h b/libc/src/__support/math/bf16mul.h
new file mode 100644
index 0000000000000..af55c519ef9cf
--- /dev/null
+++ b/libc/src/__support/math/bf16mul.h
@@ -0,0 +1,27 @@
+//===-- Implementation header for bf16mul -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16MUL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16MUL_H
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE constexpr bfloat16 bf16mul(double x, double y) {
+ return fputil::generic::mul<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16MUL_H
diff --git a/libc/src/__support/math/bf16mulf.h b/libc/src/__support/math/bf16mulf.h
new file mode 100644
index 0000000000000..24eae3d43f419
--- /dev/null
+++ b/libc/src/__support/math/bf16mulf.h
@@ -0,0 +1,27 @@
+//===-- Implementation header for bf16mulf ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF_H
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE constexpr bfloat16 bf16mulf(float x, float y) {
+ return fputil::generic::mul<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF_H
diff --git a/libc/src/__support/math/bf16mulf128.h b/libc/src/__support/math/bf16mulf128.h
new file mode 100644
index 0000000000000..d9f50044047bd
--- /dev/null
+++ b/libc/src/__support/math/bf16mulf128.h
@@ -0,0 +1,33 @@
+//===-- Implementation header for bf16mulf128 -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF128_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF128_H
+
+#include "include/llvm-libc-types/float128.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT128
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE constexpr bfloat16 bf16mulf128(float128 x, float128 y) {
+ return fputil::generic::mul<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT128
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULF128_H
diff --git a/libc/src/__support/math/bf16mull.h b/libc/src/__support/math/bf16mull.h
new file mode 100644
index 0000000000000..d54a2e9bdc272
--- /dev/null
+++ b/libc/src/__support/math/bf16mull.h
@@ -0,0 +1,27 @@
+//===-- Implementation header for bf16mull ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULL_H
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/generic/mul.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE constexpr bfloat16 bf16mull(long double x, long double y) {
+ return fputil::generic::mul<bfloat16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16MULL_H
diff --git a/libc/src/__support/math/f16add.h b/libc/src/__support/math/f16add.h
new file mode 100644
index 0000000000000..7e046c3f115dc
--- /dev/null
+++ b/libc/src/__support/math/f16add.h
@@ -0,0 +1,31 @@
+//===-- Implementation header for f16add ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_F16ADD_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_F16ADD_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE float16 f16add(double x, double y) {
+ return fputil::generic::add<float16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_F16ADD_H
diff --git a/libc/src/__support/math/f16addf.h b/libc/src/__support/math/f16addf.h
new file mode 100644
index 0000000000000..5e140bc6e5373
--- /dev/null
+++ b/libc/src/__support/math/f16addf.h
@@ -0,0 +1,31 @@
+//===-- Implementation header for f16addf -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE float16 f16addf(float x, float y) {
+ return fputil::generic::add<float16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF_H
diff --git a/libc/src/__support/math/f16addf128.h b/libc/src/__support/math/f16addf128.h
new file mode 100644
index 0000000000000..8d259a273a8d1
--- /dev/null
+++ b/libc/src/__support/math/f16addf128.h
@@ -0,0 +1,34 @@
+//===-- Implementation header for f16addf128 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF128_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF128_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+#include "include/llvm-libc-types/float128.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+#ifdef LIBC_TYPES_HAS_FLOAT128
+
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE float16 f16addf128(float128 x, float128 y) {
+ return fputil::generic::add<float16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT128
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDF128_H
diff --git a/libc/src/__support/math/f16addl.h b/libc/src/__support/math/f16addl.h
new file mode 100644
index 0000000000000..88f16857be48e
--- /dev/null
+++ b/libc/src/__support/math/f16addl.h
@@ -0,0 +1,31 @@
+//===-- Implementation header for f16addl -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDL_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE float16 f16addl(long double x, long double y) {
+ return fputil::generic::add<float16>(x, y);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_F16ADDL_H
diff --git a/libc/src/__support/math/logbl.h b/libc/src/__support/math/logbl.h
new file mode 100644
index 0000000000000..750050277c165
--- /dev/null
+++ b/libc/src/__support/math/logbl.h
@@ -0,0 +1,26 @@
+//===-- Implementation header for logbl -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_LOGBL_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_LOGBL_H
+
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE constexpr long double logbl(long double x) {
+ return fputil::logb(x);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_LOGBL_H
diff --git a/libc/src/__support/math/tanf16.h b/libc/src/__support/math/tanf16.h
new file mode 100644
index 0000000000000..6b9b9224fb84d
--- /dev/null
+++ b/libc/src/__support/math/tanf16.h
@@ -0,0 +1,137 @@
+//===-- Half-precision tanf16 function ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_TANF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_TANF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "sincosf16_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE float16 tanf16(float16 x) {
+ using namespace sincosf16_internal;
+ using FPBits = fputil::FPBits<float16>;
+ FPBits xbits(x);
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ constexpr size_t N_EXCEPTS = 9;
+ constexpr fputil::ExceptValues<float16, N_EXCEPTS> TANF16_EXCEPTS{{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ {0x2894, 0x2894, 1, 0, 1},
+ {0x3091, 0x3099, 1, 0, 0},
+ {0x3098, 0x30a0, 1, 0, 0},
+ {0x55ed, 0x3911, 1, 0, 0},
+ {0x607b, 0xc638, 0, 1, 1},
+ {0x674e, 0x3b7d, 1, 0, 0},
+ {0x6807, 0x4014, 1, 0, 1},
+ {0x6f4d, 0xbe19, 0, 1, 1},
+ {0x7330, 0xcb62, 0, 1, 0},
+ }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ uint16_t x_u = xbits.uintval();
+ uint16_t x_abs = x_u & 0x7fff;
+ float xf = x;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ bool x_sign = x_u >> 15;
+ // Handle exceptional values
+ if (auto r = TANF16_EXCEPTS.lookup_odd(x_abs, x_sign);
+ LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ // |x| <= 0x1.d1p-5
+ if (LIBC_UNLIKELY(x_abs <= 0x2b44)) {
+ // |x| <= 0x1.398p-11
+ if (LIBC_UNLIKELY(x_abs <= 0x10e6)) {
+ // tan(+/-0) = +/-0
+ if (LIBC_UNLIKELY(x_abs == 0))
+ return x;
+
+ int rounding = fputil::quick_get_round();
+
+ // Exhaustive tests show that, when:
+ // x > 0, and rounding upward or
+ // x < 0, and rounding downward then,
+ // tan(x) = x * 2^-11 + x
+ if ((xbits.is_pos() && rounding == FE_UPWARD) ||
+ (xbits.is_neg() && rounding == FE_DOWNWARD))
+ return fputil::cast<float16>(fputil::multiply_add(xf, 0x1.0p-11f, xf));
+ return x;
+ }
+
+ float xsq = xf * xf;
+
+ // Degree-6 minimax even polynomial of tan(x)/x generated by Sollya with:
+ // > P = fpminimax(tan(x)/x, [|0, 2, 4, 6|], [|1, SG...|], [0, pi/32]);
+ float result = fputil::polyeval(xsq, 0x1p0f, 0x1.555556p-2f, 0x1.110ee4p-3f,
+ 0x1.be80f6p-5f);
+
+ return fputil::cast<float16>(xf * result);
+ }
+
+ // tan(+/-inf) = NaN, and tan(NaN) = NaN
+ if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
+ if (xbits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+ // x = +/-inf
+ if (x_abs == 0x7c00) {
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_INVALID);
+ }
+
+ return x + FPBits::quiet_nan().get_val();
+ }
+
+ // Range reduction:
+ // For |x| > pi/32, we perform range reduction as follows:
+ // Find k and y such that:
+ // x = (k + y) * pi/32;
+ // k is an integer, |y| < 0.5
+ //
+ // This is done by performing:
+ // k = round(x * 32/pi)
+ // y = x * 32/pi - k
+ //
+ // Once k and y are computed, we then deduce the answer by the formula:
+ // tan(x) = sin(x) / cos(x)
+ // = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
+ float sin_k, cos_k, sin_y, cosm1_y;
+ sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y);
+
+ // Note that, cosm1_y = cos_y - 1:
+ using fputil::multiply_add;
+ return fputil::cast<float16>(
+ multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
+ multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_TANF16_H
diff --git a/libc/src/__support/math/tanpif.h b/libc/src/__support/math/tanpif.h
new file mode 100644
index 0000000000000..114fcb6053d30
--- /dev/null
+++ b/libc/src/__support/math/tanpif.h
@@ -0,0 +1,115 @@
+//===-- Single-precision tanpi function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_TANPIF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_TANPIF_H
+
+#include "sincosf_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE float tanpif(float x) {
+ using namespace sincosf_utils_internal;
+
+ using FPBits = typename fputil::FPBits<float>;
+ FPBits xbits(x);
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ constexpr size_t N_EXCEPTS = 3;
+ constexpr fputil::ExceptValues<float, N_EXCEPTS> TANPIF_EXCEPTS{{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ {0x38F26685, 0x39BE6182, 1, 0, 0},
+ {0x3E933802, 0x3FA267DD, 1, 0, 0},
+ {0x3F3663FF, 0xBFA267DD, 0, 1, 0},
+ }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ uint32_t x_u = xbits.uintval();
+ uint32_t x_abs = x_u & 0x7fff'ffffU;
+ double xd = static_cast<double>(xbits.get_val());
+
+ // Handle exceptional values
+ if (LIBC_UNLIKELY(x_abs <= 0x3F3663FF)) {
+ if (LIBC_UNLIKELY(x_abs == 0U))
+ return x;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ bool x_sign = x_u >> 31;
+
+ if (auto r = TANPIF_EXCEPTS.lookup_odd(x_abs, x_sign);
+ LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ }
+
+ // Numbers greater or equal to 2^23 are always integers, or infinity, or NaN
+ if (LIBC_UNLIKELY(x_abs >= 0x4B00'0000)) {
+ // x is inf or NaN.
+ if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) {
+ if (xbits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+
+ if (x_abs == 0x7f80'0000U) {
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_INVALID);
+ }
+
+ return x + FPBits::quiet_nan().get_val();
+ }
+
+ return FPBits::zero(xbits.sign()).get_val();
+ }
+
+ // Range reduction:
+ // For |x| > 1/32, we perform range reduction as follows:
+ // Find k and y such that:
+ // x = (k + y) * 1/32
+ // k is an integer
+ // |y| < 0.5
+ //
+ // This is done by performing:
+ // k = round(x * 32)
+ // y = x * 32 - k
+ //
+ // Once k and y are computed, we then deduce the answer by the formula:
+ // tan(pi * x) = sin(pi * x) / cos(pi * x)
+ // = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
+ double sin_k, cos_k, sin_y, cosm1_y;
+ sincospif_eval(xd, sin_k, cos_k, sin_y, cosm1_y);
+
+ if (LIBC_UNLIKELY(sin_y == 0 && cos_k == 0)) {
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_DIVBYZERO);
+
+ int32_t x_mp5_i = static_cast<int32_t>(xd - 0.5);
+ return FPBits::inf((x_mp5_i & 0x1) ? Sign::NEG : Sign::POS).get_val();
+ }
+
+ using fputil::multiply_add;
+ return fputil::cast<float>(
+ multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
+ multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_TANPIF_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 47101706ce4c8..d9e33686a3132 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -453,16 +453,7 @@ add_entrypoint_object(
HDRS
../tanf16.h
DEPENDS
- libc.hdr.errno_macros
- libc.hdr.fenv_macros
- libc.src.__support.FPUtil.cast
- libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.except_value_utils
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.macros.optimization
- libc.src.__support.macros.properties.types
- libc.src.__support.math.sincosf16_utils
+ libc.src.__support.math.tanf16
)
add_entrypoint_object(
@@ -472,12 +463,7 @@ add_entrypoint_object(
HDRS
../tanpif.h
DEPENDS
- libc.src.__support.math.sincosf_utils
- libc.src.__support.FPUtil.except_value_utils
- libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.macros.optimization
+ libc.src.__support.math.tanpif
)
add_entrypoint_object(
@@ -2013,7 +1999,7 @@ add_entrypoint_object(
HDRS
../logbl.h
DEPENDS
- libc.src.__support.FPUtil.manipulation_functions
+ libc.src.__support.math.logbl
)
add_entrypoint_object(
@@ -4791,8 +4777,7 @@ add_entrypoint_object(
HDRS
../f16add.h
DEPENDS
- libc.src.__support.macros.properties.types
- libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.math.f16add
)
add_entrypoint_object(
@@ -4802,30 +4787,27 @@ add_entrypoint_object(
HDRS
../f16addf.h
DEPENDS
- libc.src.__support.macros.properties.types
- libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.math.f16addf
)
add_entrypoint_object(
- f16addl
+ f16addf128
SRCS
- f16addl.cpp
+ f16addf128.cpp
HDRS
- ../f16addl.h
+ ../f16addf128.h
DEPENDS
- libc.src.__support.macros.properties.types
- libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.math.f16addf128
)
add_entrypoint_object(
- f16addf128
+ f16addl
SRCS
- f16addf128.cpp
+ f16addl.cpp
HDRS
- ../f16addf128.h
+ ../f16addl.h
DEPENDS
- libc.src.__support.macros.properties.types
- libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.math.f16addl
)
add_entrypoint_object(
@@ -5189,11 +5171,7 @@ add_entrypoint_object(
HDRS
../bf16divl.h
DEPENDS
- libc.src.__support.common
- libc.src.__support.FPUtil.bfloat16
- libc.src.__support.FPUtil.generic.div
- libc.src.__support.macros.config
- libc.src.__support.macros.properties.types
+ libc.src.__support.math.bf16divl
)
add_entrypoint_object(
@@ -5241,11 +5219,7 @@ add_entrypoint_object(
HDRS
../bf16fmal.h
DEPENDS
- libc.src.__support.common
- libc.src.__support.FPUtil.bfloat16
- libc.src.__support.FPUtil.fma
- libc.src.__support.macros.config
- libc.src.__support.macros.properties.types
+ libc.src.__support.math.bf16fmal
)
add_entrypoint_object(
@@ -5269,11 +5243,7 @@ add_entrypoint_object(
HDRS
../bf16mul.h
DEPENDS
- libc.src.__support.common
- libc.src.__support.FPUtil.bfloat16
- libc.src.__support.FPUtil.generic.mul
- libc.src.__support.macros.config
- libc.src.__support.macros.properties.types
+ libc.src.__support.math.bf16mul
)
add_entrypoint_object(
@@ -5283,11 +5253,7 @@ add_entrypoint_object(
HDRS
../bf16mulf.h
DEPENDS
- libc.src.__support.common
- libc.src.__support.FPUtil.bfloat16
- libc.src.__support.FPUtil.generic.mul
- libc.src.__support.macros.config
- libc.src.__support.macros.properties.types
+ libc.src.__support.math.bf16mulf
)
add_entrypoint_object(
@@ -5297,11 +5263,7 @@ add_entrypoint_object(
HDRS
../bf16mull.h
DEPENDS
- libc.src.__support.common
- libc.src.__support.FPUtil.bfloat16
- libc.src.__support.FPUtil.generic.mul
- libc.src.__support.macros.config
- libc.src.__support.macros.properties.types
+ libc.src.__support.math.bf16mull
)
add_entrypoint_object(
@@ -5311,11 +5273,7 @@ add_entrypoint_object(
HDRS
../bf16mulf128.h
DEPENDS
- libc.src.__support.common
- libc.src.__support.FPUtil.bfloat16
- libc.src.__support.FPUtil.generic.mul
- libc.src.__support.macros.config
- libc.src.__support.macros.properties.types
+ libc.src.__support.math.bf16mulf128
)
add_entrypoint_object(
diff --git a/libc/src/math/generic/bf16divl.cpp b/libc/src/math/generic/bf16divl.cpp
index 21dd6b150e07a..432ed829005f0 100644
--- a/libc/src/math/generic/bf16divl.cpp
+++ b/libc/src/math/generic/bf16divl.cpp
@@ -7,15 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/bf16divl.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/div.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16divl.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(bfloat16, bf16divl, (long double x, long double y)) {
- return fputil::generic::div<bfloat16>(x, y);
+ return math::bf16divl(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16fmal.cpp b/libc/src/math/generic/bf16fmal.cpp
index f31ec6904760b..0e8f1901c2093 100644
--- a/libc/src/math/generic/bf16fmal.cpp
+++ b/libc/src/math/generic/bf16fmal.cpp
@@ -7,16 +7,13 @@
//===----------------------------------------------------------------------===//
#include "src/math/bf16fmal.h"
-
-#include "src/__support/FPUtil/FMA.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16fmal.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(bfloat16, bf16fmal,
(long double x, long double y, long double z)) {
- return fputil::fma<bfloat16>(x, y, z);
+ return math::bf16fmal(x, y, z);
}
+
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16mul.cpp b/libc/src/math/generic/bf16mul.cpp
index c50eec2b52e5c..e6b5d81004d97 100644
--- a/libc/src/math/generic/bf16mul.cpp
+++ b/libc/src/math/generic/bf16mul.cpp
@@ -7,15 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/bf16mul.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/mul.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16mul.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(bfloat16, bf16mul, (double x, double y)) {
- return fputil::generic::mul<bfloat16>(x, y);
+ return math::bf16mul(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16mulf.cpp b/libc/src/math/generic/bf16mulf.cpp
index 117fcd1d661ab..a16086decb7ca 100644
--- a/libc/src/math/generic/bf16mulf.cpp
+++ b/libc/src/math/generic/bf16mulf.cpp
@@ -7,15 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/bf16mulf.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/mul.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16mulf.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(bfloat16, bf16mulf, (float x, float y)) {
- return fputil::generic::mul<bfloat16>(x, y);
+ return math::bf16mulf(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16mulf128.cpp b/libc/src/math/generic/bf16mulf128.cpp
index ff2a081d82e6b..685568c15d161 100644
--- a/libc/src/math/generic/bf16mulf128.cpp
+++ b/libc/src/math/generic/bf16mulf128.cpp
@@ -7,15 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/bf16mulf128.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/mul.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16mulf128.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(bfloat16, bf16mulf128, (float128 x, float128 y)) {
- return fputil::generic::mul<bfloat16>(x, y);
+ return math::bf16mulf128(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16mull.cpp b/libc/src/math/generic/bf16mull.cpp
index e7c4fc085a3cd..11a7ef833e6fb 100644
--- a/libc/src/math/generic/bf16mull.cpp
+++ b/libc/src/math/generic/bf16mull.cpp
@@ -7,15 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/bf16mull.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/generic/mul.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16mull.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(bfloat16, bf16mull, (long double x, long double y)) {
- return fputil::generic::mul<bfloat16>(x, y);
+ return math::bf16mull(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16add.cpp b/libc/src/math/generic/f16add.cpp
index e9be8a743721e..fcd31e73183a1 100644
--- a/libc/src/math/generic/f16add.cpp
+++ b/libc/src/math/generic/f16add.cpp
@@ -7,14 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/f16add.h"
-#include "src/__support/FPUtil/generic/add_sub.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/f16add.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(float16, f16add, (double x, double y)) {
- return fputil::generic::add<float16>(x, y);
+ return math::f16add(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16addf.cpp b/libc/src/math/generic/f16addf.cpp
index ee05ff7f00531..63fe8c5f044bd 100644
--- a/libc/src/math/generic/f16addf.cpp
+++ b/libc/src/math/generic/f16addf.cpp
@@ -7,14 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/f16addf.h"
-#include "src/__support/FPUtil/generic/add_sub.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/f16addf.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(float16, f16addf, (float x, float y)) {
- return fputil::generic::add<float16>(x, y);
+ return math::f16addf(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16addf128.cpp b/libc/src/math/generic/f16addf128.cpp
index 4e9038e23125a..87e327e0c20d4 100644
--- a/libc/src/math/generic/f16addf128.cpp
+++ b/libc/src/math/generic/f16addf128.cpp
@@ -7,14 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/f16addf128.h"
-#include "src/__support/FPUtil/generic/add_sub.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/f16addf128.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(float16, f16addf128, (float128 x, float128 y)) {
- return fputil::generic::add<float16>(x, y);
+ return math::f16addf128(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/f16addl.cpp b/libc/src/math/generic/f16addl.cpp
index 925f08418b99d..4d93b7105aa79 100644
--- a/libc/src/math/generic/f16addl.cpp
+++ b/libc/src/math/generic/f16addl.cpp
@@ -7,14 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/f16addl.h"
-#include "src/__support/FPUtil/generic/add_sub.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/f16addl.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(float16, f16addl, (long double x, long double y)) {
- return fputil::generic::add<float16>(x, y);
+ return math::f16addl(x, y);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/logbl.cpp b/libc/src/math/generic/logbl.cpp
index dcab957f2c9c5..6c1df6d6549c0 100644
--- a/libc/src/math/generic/logbl.cpp
+++ b/libc/src/math/generic/logbl.cpp
@@ -7,14 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/logbl.h"
-#include "src/__support/FPUtil/ManipulationFunctions.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/logbl.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(long double, logbl, (long double x)) {
- return fputil::logb(x);
+ return math::logbl(x);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/tanf16.cpp b/libc/src/math/generic/tanf16.cpp
index 880ba0101a96e..8126a06cbaba9 100644
--- a/libc/src/math/generic/tanf16.cpp
+++ b/libc/src/math/generic/tanf16.cpp
@@ -7,118 +7,10 @@
//===----------------------------------------------------------------------===//
#include "src/math/tanf16.h"
-#include "hdr/errno_macros.h"
-#include "hdr/fenv_macros.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/macros/optimization.h"
-#include "src/__support/math/sincosf16_utils.h"
+#include "src/__support/math/tanf16.h"
namespace LIBC_NAMESPACE_DECL {
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-constexpr size_t N_EXCEPTS = 9;
-
-constexpr fputil::ExceptValues<float16, N_EXCEPTS> TANF16_EXCEPTS{{
- // (input, RZ output, RU offset, RD offset, RN offset)
- {0x2894, 0x2894, 1, 0, 1},
- {0x3091, 0x3099, 1, 0, 0},
- {0x3098, 0x30a0, 1, 0, 0},
- {0x55ed, 0x3911, 1, 0, 0},
- {0x607b, 0xc638, 0, 1, 1},
- {0x674e, 0x3b7d, 1, 0, 0},
- {0x6807, 0x4014, 1, 0, 1},
- {0x6f4d, 0xbe19, 0, 1, 1},
- {0x7330, 0xcb62, 0, 1, 0},
-}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) {
- using namespace math::sincosf16_internal;
- using FPBits = fputil::FPBits<float16>;
- FPBits xbits(x);
-
- uint16_t x_u = xbits.uintval();
- uint16_t x_abs = x_u & 0x7fff;
- float xf = x;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- bool x_sign = x_u >> 15;
- // Handle exceptional values
- if (auto r = TANF16_EXCEPTS.lookup_odd(x_abs, x_sign);
- LIBC_UNLIKELY(r.has_value()))
- return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
- // |x| <= 0x1.d1p-5
- if (LIBC_UNLIKELY(x_abs <= 0x2b44)) {
- // |x| <= 0x1.398p-11
- if (LIBC_UNLIKELY(x_abs <= 0x10e6)) {
- // tan(+/-0) = +/-0
- if (LIBC_UNLIKELY(x_abs == 0))
- return x;
-
- int rounding = fputil::quick_get_round();
-
- // Exhaustive tests show that, when:
- // x > 0, and rounding upward or
- // x < 0, and rounding downward then,
- // tan(x) = x * 2^-11 + x
- if ((xbits.is_pos() && rounding == FE_UPWARD) ||
- (xbits.is_neg() && rounding == FE_DOWNWARD))
- return fputil::cast<float16>(fputil::multiply_add(xf, 0x1.0p-11f, xf));
- return x;
- }
-
- float xsq = xf * xf;
-
- // Degree-6 minimax odd polynomial of tan(x) generated by Sollya with:
- // > P = fpminimax(tan(x)/x, [|0, 2, 4, 6|], [|1, SG...|], [0, pi/32]);
- float result = fputil::polyeval(xsq, 0x1p0f, 0x1.555556p-2f, 0x1.110ee4p-3f,
- 0x1.be80f6p-5f);
-
- return fputil::cast<float16>(xf * result);
- }
-
- // tan(+/-inf) = NaN, and tan(NaN) = NaN
- if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
- if (xbits.is_signaling_nan()) {
- fputil::raise_except_if_required(FE_INVALID);
- return FPBits::quiet_nan().get_val();
- }
- // x = +/-inf
- if (x_abs == 0x7c00) {
- fputil::set_errno_if_required(EDOM);
- fputil::raise_except_if_required(FE_INVALID);
- }
-
- return x + FPBits::quiet_nan().get_val();
- }
-
- // Range reduction:
- // For |x| > pi/32, we perform range reduction as follows:
- // Find k and y such that:
- // x = (k + y) * pi/32;
- // k is an integer, |y| < 0.5
- //
- // This is done by performing:
- // k = round(x * 32/pi)
- // y = x * 32/pi - k
- //
- // Once k and y are computed, we then deduce the answer by the formula:
- // tan(x) = sin(x) / cos(x)
- // = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
- float sin_k, cos_k, sin_y, cosm1_y;
- sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y);
-
- // Note that, cosm1_y = cos_y - 1:
- using fputil::multiply_add;
- return fputil::cast<float16>(
- multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
- multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
-}
+LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) { return math::tanf16(x); }
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/tanpif.cpp b/libc/src/math/generic/tanpif.cpp
index 44df22b517a46..e3568d6e9d35c 100644
--- a/libc/src/math/generic/tanpif.cpp
+++ b/libc/src/math/generic/tanpif.cpp
@@ -7,101 +7,10 @@
//===----------------------------------------------------------------------===//
#include "src/math/tanpif.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/__support/math/sincosf_utils.h"
+#include "src/__support/math/tanpif.h"
namespace LIBC_NAMESPACE_DECL {
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-constexpr size_t N_EXCEPTS = 3;
-
-constexpr fputil::ExceptValues<float, N_EXCEPTS> TANPIF_EXCEPTS{{
- // (input, RZ output, RU offset, RD offset, RN offset)
- {0x38F26685, 0x39BE6182, 1, 0, 0},
- {0x3E933802, 0x3FA267DD, 1, 0, 0},
- {0x3F3663FF, 0xBFA267DD, 0, 1, 0},
-}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float, tanpif, (float x)) {
- using namespace math::sincosf_utils_internal;
- using FPBits = typename fputil::FPBits<float>;
- FPBits xbits(x);
-
- uint32_t x_u = xbits.uintval();
- uint32_t x_abs = x_u & 0x7fff'ffffU;
- double xd = static_cast<double>(xbits.get_val());
-
- // Handle exceptional values
- if (LIBC_UNLIKELY(x_abs <= 0x3F3663FF)) {
- if (LIBC_UNLIKELY(x_abs == 0U))
- return x;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- bool x_sign = x_u >> 31;
-
- if (auto r = TANPIF_EXCEPTS.lookup_odd(x_abs, x_sign);
- LIBC_UNLIKELY(r.has_value()))
- return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- }
-
- // Numbers greater or equal to 2^23 are always integers, or infinity, or NaN
- if (LIBC_UNLIKELY(x_abs >= 0x4B00'0000)) {
- // x is inf or NaN.
- if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) {
- if (xbits.is_signaling_nan()) {
- fputil::raise_except_if_required(FE_INVALID);
- return FPBits::quiet_nan().get_val();
- }
-
- if (x_abs == 0x7f80'0000U) {
- fputil::set_errno_if_required(EDOM);
- fputil::raise_except_if_required(FE_INVALID);
- }
-
- return x + FPBits::quiet_nan().get_val();
- }
-
- return FPBits::zero(xbits.sign()).get_val();
- }
-
- // Range reduction:
- // For |x| > 1/32, we perform range reduction as follows:
- // Find k and y such that:
- // x = (k + y) * 1/32
- // k is an integer
- // |y| < 0.5
- //
- // This is done by performing:
- // k = round(x * 32)
- // y = x * 32 - k
- //
- // Once k and y are computed, we then deduce the answer by the formula:
- // tan(x) = sin(x) / cos(x)
- // = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k)
- double sin_k, cos_k, sin_y, cosm1_y;
- sincospif_eval(xd, sin_k, cos_k, sin_y, cosm1_y);
-
- if (LIBC_UNLIKELY(sin_y == 0 && cos_k == 0)) {
- fputil::set_errno_if_required(EDOM);
- fputil::raise_except_if_required(FE_DIVBYZERO);
-
- int32_t x_mp5_i = static_cast<int32_t>(xd - 0.5);
- return FPBits::inf((x_mp5_i & 0x1) ? Sign::NEG : Sign::POS).get_val();
- }
-
- using fputil::multiply_add;
- return fputil::cast<float>(
- multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) /
- multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k)));
-}
+LLVM_LIBC_FUNCTION(float, tanpif, (float x)) { return math::tanpif(x); }
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index dfe2378269921..91942a12016ec 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -30,7 +30,13 @@ add_fp_unittest(
libc.src.__support.math.bf16addf
libc.src.__support.math.bf16addf128
libc.src.__support.math.bf16divf
+ libc.src.__support.math.bf16divl
libc.src.__support.math.bf16fmaf
+ libc.src.__support.math.bf16fmal
+ libc.src.__support.math.bf16mul
+ libc.src.__support.math.bf16mulf
+ libc.src.__support.math.bf16mulf128
+ libc.src.__support.math.bf16mull
libc.src.__support.math.canonicalize
libc.src.__support.math.canonicalizebf16
libc.src.__support.math.canonicalizef
@@ -66,6 +72,10 @@ add_fp_unittest(
libc.src.__support.math.exp10f16
libc.src.__support.math.expf
libc.src.__support.math.expf16
+ libc.src.__support.math.f16add
+ libc.src.__support.math.f16addf
+ libc.src.__support.math.f16addf128
+ libc.src.__support.math.f16addl
libc.src.__support.math.f16fma
libc.src.__support.math.f16fmaf
libc.src.__support.math.f16fmaf128
@@ -110,6 +120,7 @@ add_fp_unittest(
libc.src.__support.math.llogbf
libc.src.__support.math.llogbf128
libc.src.__support.math.llogbf16
+ libc.src.__support.math.logbl
libc.src.__support.math.logf16
libc.src.__support.math.llogbl
libc.src.__support.math.pow
@@ -131,6 +142,8 @@ add_fp_unittest(
libc.src.__support.math.sqrtf
libc.src.__support.math.tan
libc.src.__support.math.tanf
+ libc.src.__support.math.tanf16
libc.src.__support.math.tanhf
libc.src.__support.math.tanhf16
+ libc.src.__support.math.tanpif
)
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 8eba836538c41..d290fadb52d11 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -47,8 +47,14 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) {
EXPECT_FP_EQ(10.0f16, LIBC_NAMESPACE::shared::f16fmaf128(
float128(2.0), float128(3.0), float128(4.0)));
+ EXPECT_FP_EQ(
+ 5.0f16, LIBC_NAMESPACE::shared::f16addf128(float128(2.0), float128(3.0)));
+
#endif
+ EXPECT_FP_EQ(5.0f16, LIBC_NAMESPACE::shared::f16add(2.0, 3.0));
+ EXPECT_FP_EQ(5.0f16, LIBC_NAMESPACE::shared::f16addf(2.0f, 3.0f));
+ EXPECT_FP_EQ(5.0f16, LIBC_NAMESPACE::shared::f16addl(2.0L, 3.0L));
EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::f16sqrt(0.0));
EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::f16sqrtf(0.0f));
@@ -73,6 +79,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) {
EXPECT_FP_EQ(0x1.921fb6p+0f16, LIBC_NAMESPACE::shared::acosf16(0.0f16));
EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::f16sqrtl(1.0L));
EXPECT_FP_EQ(0.0f16, LIBC_NAMESPACE::shared::sinf16(0.0f16));
+ EXPECT_FP_EQ(0.0f16, LIBC_NAMESPACE::shared::tanf16(0.0f16));
EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::sinpif16(0.0f16));
EXPECT_FP_EQ(0.0f16, LIBC_NAMESPACE::shared::tanhf16(0.0f16));
@@ -136,12 +143,15 @@ TEST(LlvmLibcSharedMathTest, AllFloat) {
EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::shared::sqrtf(0.0f));
EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::shared::tanf(0.0f));
EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::shared::tanhf(0.0f));
+ EXPECT_FP_EQ(0.0f, LIBC_NAMESPACE::shared::tanpif(0.0f));
float canonicalizef_cx = 0.0f;
float canonicalizef_x = 0.0f;
EXPECT_EQ(0, LIBC_NAMESPACE::shared::canonicalizef(&canonicalizef_cx,
&canonicalizef_x));
EXPECT_FP_EQ(0x0p+0f, canonicalizef_cx);
+
+ EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::bf16mulf(0.0f, 0.0f));
}
TEST(LlvmLibcSharedMathTest, AllDouble) {
@@ -187,6 +197,7 @@ TEST(LlvmLibcSharedMathTest, AllLongDouble) {
EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::fsqrtl(0.0L));
EXPECT_EQ(0, LIBC_NAMESPACE::shared::ilogbl(0x1.p+0L));
EXPECT_EQ(0L, LIBC_NAMESPACE::shared::llogbl(1.0L));
+ EXPECT_FP_EQ(0x0p+0L, LIBC_NAMESPACE::shared::logbl(1.0L));
EXPECT_FP_EQ(10.0f, LIBC_NAMESPACE::shared::ffmal(2.0L, 3.0, 4.0L));
long double canonicalizel_cx = 0.0L;
@@ -194,6 +205,10 @@ TEST(LlvmLibcSharedMathTest, AllLongDouble) {
EXPECT_EQ(0, LIBC_NAMESPACE::shared::canonicalizel(&canonicalizel_cx,
&canonicalizel_x));
EXPECT_FP_EQ(0x0p+0L, canonicalizel_cx);
+
+ EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::bf16mul(0.0L, 0.0L));
+
+ EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::bf16mull(0.0L, 0.0L));
}
#ifdef LIBC_TYPES_HAS_FLOAT128
@@ -229,6 +244,9 @@ TEST(LlvmLibcSharedMathTest, AllFloat128) {
EXPECT_EQ(0, LIBC_NAMESPACE::shared::canonicalizef128(&canonicalizef128_cx,
&canonicalizef128_x));
EXPECT_FP_EQ(float128(0.0), canonicalizef128_cx);
+
+ EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::bf16mulf128(
+ float128(0.0), float128(0.0)));
}
#endif // LIBC_TYPES_HAS_FLOAT128
@@ -236,6 +254,9 @@ TEST(LlvmLibcSharedMathTest, AllFloat128) {
TEST(LlvmLibcSharedMathTest, AllBFloat16) {
EXPECT_FP_EQ(bfloat16(5.0), LIBC_NAMESPACE::shared::bf16add(2.0, 3.0));
EXPECT_FP_EQ(bfloat16(2.0f), LIBC_NAMESPACE::shared::bf16divf(4.0f, 2.0f));
+ EXPECT_FP_EQ(bfloat16(2.0), LIBC_NAMESPACE::shared::bf16divl(6.0L, 3.0L));
+ EXPECT_FP_EQ(bfloat16(10.0),
+ LIBC_NAMESPACE::shared::bf16fmal(2.0L, 3.0L, 4.0L));
bfloat16 canonicalizebf16_cx = bfloat16(0.0);
bfloat16 canonicalizebf16_x = bfloat16(0.0);
diff --git a/libclc/opencl/lib/amdgcn/workitem/get_local_size.cl b/libclc/opencl/lib/amdgcn/workitem/get_local_size.cl
index 8aa24201de573..34e4f2f1b4c19 100644
--- a/libclc/opencl/lib/amdgcn/workitem/get_local_size.cl
+++ b/libclc/opencl/lib/amdgcn/workitem/get_local_size.cl
@@ -8,18 +8,14 @@
#include <clc/opencl/opencl-base.h>
-uint __clc_amdgcn_get_local_size_x(void) __asm("llvm.r600.read.local.size.x");
-uint __clc_amdgcn_get_local_size_y(void) __asm("llvm.r600.read.local.size.y");
-uint __clc_amdgcn_get_local_size_z(void) __asm("llvm.r600.read.local.size.z");
-
_CLC_DEF _CLC_OVERLOAD size_t get_local_size(uint dim) {
switch (dim) {
case 0:
- return __clc_amdgcn_get_local_size_x();
+ return __builtin_amdgcn_workgroup_size_x();
case 1:
- return __clc_amdgcn_get_local_size_y();
+ return __builtin_amdgcn_workgroup_size_y();
case 2:
- return __clc_amdgcn_get_local_size_z();
+ return __builtin_amdgcn_workgroup_size_z();
default:
return 1;
}
diff --git a/libclc/opencl/lib/amdgcn/workitem/get_num_groups.cl b/libclc/opencl/lib/amdgcn/workitem/get_num_groups.cl
index 11c1ba373aeff..9e8dddb859064 100644
--- a/libclc/opencl/lib/amdgcn/workitem/get_num_groups.cl
+++ b/libclc/opencl/lib/amdgcn/workitem/get_num_groups.cl
@@ -8,18 +8,14 @@
#include <clc/opencl/opencl-base.h>
-uint __clc_amdgcn_get_num_groups_x(void) __asm("llvm.r600.read.ngroups.x");
-uint __clc_amdgcn_get_num_groups_y(void) __asm("llvm.r600.read.ngroups.y");
-uint __clc_amdgcn_get_num_groups_z(void) __asm("llvm.r600.read.ngroups.z");
-
_CLC_DEF _CLC_OVERLOAD size_t get_num_groups(uint dim) {
switch (dim) {
case 0:
- return __clc_amdgcn_get_num_groups_x();
+ return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
case 1:
- return __clc_amdgcn_get_num_groups_y();
+ return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
case 2:
- return __clc_amdgcn_get_num_groups_z();
+ return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
default:
return 1;
}
diff --git a/libsycl/src/detail/offload/offload_topology.cpp b/libsycl/src/detail/offload/offload_topology.cpp
index 5e595e520a452..ab4c57ecf37eb 100644
--- a/libsycl/src/detail/offload/offload_topology.cpp
+++ b/libsycl/src/detail/offload/offload_topology.cpp
@@ -56,7 +56,7 @@ void OffloadTopology::registerNewPlatformsAndDevices(
}
void discoverOffloadDevices() {
- callAndThrow(olInit);
+ callAndThrow(olInit, nullptr);
// liboffload returns devices sorted by backend + platform. We rely on this
// behavior during device enumeration.
diff --git a/lldb/include/lldb/Host/ProcessLaunchInfo.h b/lldb/include/lldb/Host/ProcessLaunchInfo.h
index d89fe68b2d0d4..e13eecc9463ea 100644
--- a/lldb/include/lldb/Host/ProcessLaunchInfo.h
+++ b/lldb/include/lldb/Host/ProcessLaunchInfo.h
@@ -137,8 +137,7 @@ class ProcessLaunchInfo : public ProcessInfo {
bool ShouldUsePTY() const {
#ifdef _WIN32
return GetPTY().GetPseudoTerminalHandle() != ((HANDLE)(long long)-1) &&
- GetNumFileActions() == 0 &&
- GetFlags().Test(lldb::eLaunchFlagLaunchInTTY);
+ GetNumFileActions() == 0;
#else
return true;
#endif
diff --git a/lldb/include/lldb/Host/posix/HostThreadPosix.h b/lldb/include/lldb/Host/posix/HostThreadPosix.h
index 6c8e09fc11030..32be7154fa1d8 100644
--- a/lldb/include/lldb/Host/posix/HostThreadPosix.h
+++ b/lldb/include/lldb/Host/posix/HostThreadPosix.h
@@ -25,7 +25,7 @@ class HostThreadPosix : public HostNativeThreadBase {
Status Join(lldb::thread_result_t *result) override;
Status Cancel() override;
- Status Detach();
+ void Reset() override;
};
} // namespace lldb_private
diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h
index 361108fd8f0e7..438a5e1faf86d 100644
--- a/lldb/include/lldb/Utility/ArchSpec.h
+++ b/lldb/include/lldb/Utility/ArchSpec.h
@@ -14,6 +14,7 @@
#include "lldb/lldb-forward.h"
#include "lldb/lldb-private-enumerations.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/TargetParser/Triple.h"
#include <cstddef>
#include <cstdint>
@@ -542,6 +543,14 @@ class ArchSpec {
void SetFlags(const std::string &elf_abi);
+ const llvm::SubtargetFeatures &GetSubtargetFeatures() const {
+ return m_subtarget_features;
+ }
+
+ void SetSubtargetFeatures(llvm::SubtargetFeatures &&subtarget_features) {
+ m_subtarget_features = std::move(subtarget_features);
+ }
+
protected:
void UpdateCore();
@@ -553,6 +562,8 @@ class ArchSpec {
// these are application specific extensions like micromips, mips16 etc.
uint32_t m_flags = 0;
+ llvm::SubtargetFeatures m_subtarget_features;
+
// Called when m_def or m_entry are changed. Fills in all remaining members
// with default values.
void CoreUpdated(bool update_triple);
diff --git a/lldb/packages/Python/lldbsuite/test/lldbinline.py b/lldb/packages/Python/lldbsuite/test/lldbinline.py
index ae38ab9d8c9d7..d1225db4d61a9 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbinline.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbinline.py
@@ -210,4 +210,5 @@ def MakeInlineTest(__file, __globals, decorators=None, name=None, build_dict=Non
# correctly in test results.
test_class.test_filename = __file
test_class.mydir = TestBase.compute_mydir(__file)
+ test_class.SHARED_BUILD_TESTCASE = False
return test_class
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 6034eca3b93f2..65fd56ed76c1c 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -569,6 +569,11 @@ class Base(unittest.TestCase):
# Can be overridden by the LLDB_TIME_WAIT_NEXT_LAUNCH environment variable.
timeWaitNextLaunch = 1.0
+ # Some test case classes require a separate build directory for each test
+ # function. Subclasses can set this to False in those cases. This slows down
+ # the test, but provides isolation where needed.
+ SHARED_BUILD_TESTCASE = True
+
@staticmethod
def compute_mydir(test_file):
"""Subclasses should call this function to correctly calculate the
@@ -754,7 +759,10 @@ def getSourceDir(self):
return os.path.join(configuration.test_src_root, self.mydir)
def getBuildDirBasename(self):
- return self.__class__.__module__ + "." + self.testMethodName
+ if self.SHARED_BUILD_TESTCASE:
+ return self.__class__.__module__
+ else:
+ return self.__class__.__module__ + "." + self.testMethodName
def getBuildDir(self):
"""Return the full path to the current test."""
@@ -763,10 +771,10 @@ def getBuildDir(self):
)
def makeBuildDir(self):
- """Create the test-specific working directory, deleting any previous
- contents."""
+ """Create the test-specific working directory, optionally deleting any
+ previous contents."""
bdir = self.getBuildDir()
- if os.path.isdir(bdir):
+ if os.path.isdir(bdir) and not self.SHARED_BUILD_TESTCASE:
shutil.rmtree(bdir)
lldbutil.mkdir_p(bdir)
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 8e342b5277fc4..e8fd8e8e37e65 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -245,10 +245,7 @@ def __init__(
self.terminated: bool = False
self.events: List[Event] = []
self.progress_events: List[Event] = []
- self.invalidated_event: Optional[Event] = None
- self.memory_event: Optional[Event] = None
self.reverse_requests: List[Request] = []
- self.module_events: List[Dict] = []
self.sequence: int = 1
self.output: Dict[str, str] = {}
self.reverse_process: Optional[subprocess.Popen] = None
@@ -513,10 +510,6 @@ def _handle_event(self, packet: Event) -> None:
elif event == "capabilities" and body:
# Update the capabilities with new ones from the event.
self.capabilities.update(body["capabilities"])
- elif event == "invalidated":
- self.invalidated_event = packet
- elif event == "memory":
- self.memory_event = packet
def _handle_reverse_request(self, request: Request) -> None:
if request in self.reverse_requests:
@@ -704,6 +697,18 @@ def wait_for_terminated(self):
raise ValueError("didn't get terminated event")
return event_dict
+ def wait_for_invalidated(self):
+ event_dict = self.wait_for_event(["invalidated"])
+ if event_dict is None:
+ raise ValueError("didn't get invalidated event")
+ return event_dict
+
+ def wait_for_memory(self):
+ event_dict = self.wait_for_event(["memory"])
+ if event_dict is None:
+ raise ValueError("didn't get memory event")
+ return event_dict
+
def get_capability(self, key: str):
"""Get a value for the given key if it there is a key/value pair in
the capabilities reported by the adapter.
@@ -1581,7 +1586,7 @@ def request_threads(self):
return response
def request_variables(
- self, variablesReference, start=None, count=None, is_hex=None
+ self, variablesReference, start=None, count=None, is_hex: Optional[bool] = None
):
args_dict = {"variablesReference": variablesReference}
if start is not None:
@@ -1597,7 +1602,7 @@ def request_variables(
}
return self._send_recv(command_dict)
- def request_setVariable(self, containingVarRef, name, value, id=None):
+ def request_setVariable(self, containingVarRef, name, value, id=None, is_hex=None):
args_dict = {
"variablesReference": containingVarRef,
"name": name,
@@ -1605,6 +1610,8 @@ def request_setVariable(self, containingVarRef, name, value, id=None):
}
if id is not None:
args_dict["id"] = id
+ if is_hex is not None:
+ args_dict["format"] = {"hex": is_hex}
command_dict = {
"command": "setVariable",
"type": "request",
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index e49e4a28e3878..14a5698653588 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -265,21 +265,6 @@ def verify_commands(self, flavor: str, output: str, commands: List[str]):
f"Command '{flavor}' - '{cmd}' not found in output: {output}",
)
- def verify_invalidated_event(self, expected_areas):
- event = self.dap_server.invalidated_event
- self.dap_server.invalidated_event = None
- self.assertIsNotNone(event)
- areas = event["body"].get("areas", [])
- self.assertEqual(set(expected_areas), set(areas))
-
- def verify_memory_event(self, memoryReference):
- if memoryReference is None:
- self.assertIsNone(self.dap_server.memory_event)
- event = self.dap_server.memory_event
- self.dap_server.memory_event = None
- self.assertIsNotNone(event)
- self.assertEqual(memoryReference, event["body"].get("memoryReference"))
-
def get_dict_value(self, d: Mapping[str, Any], key_path: List[str]) -> Any:
"""Verify each key in the key_path array is in contained in each
dictionary within "d". Assert if any key isn't in the
@@ -375,21 +360,28 @@ def get_local_as_int(self, name, threadId=None):
else:
return int(value)
- def set_variable(self, varRef, name, value, id=None):
+ def set_variable(self, varRef, name, value, id=None, is_hex: Optional[bool] = None):
"""Set a variable."""
- response = self.dap_server.request_setVariable(varRef, name, str(value), id=id)
+ response = self.dap_server.request_setVariable(
+ varRef, name, str(value), id=id, is_hex=is_hex
+ )
if response["success"]:
- self.verify_invalidated_event(["variables"])
- self.verify_memory_event(response["body"].get("memoryReference"))
+ invalidated_event = self.dap_server.wait_for_invalidated()
+ self.assertEqual(invalidated_event["body"].get("areas"), ["variables"])
+ memory_event = self.dap_server.wait_for_memory()
+ self.assertEqual(
+ memory_event["body"].get("memoryReference"),
+ response["body"].get("memoryReference"),
+ )
return response
- def set_local(self, name, value, id=None):
+ def set_local(self, name, value, id=None, is_hex: Optional[bool] = None):
"""Set a top level local variable only."""
# Get the locals scope reference dynamically
locals_ref = self.get_locals_scope_reference()
if locals_ref is None:
return None
- return self.set_variable(locals_ref, name, str(value), id=id)
+ return self.set_variable(locals_ref, name, str(value), id=id, is_hex=is_hex)
def get_locals_scope_reference(self):
"""Get the variablesReference for the locals scope."""
@@ -623,5 +615,6 @@ def writeMemory(self, memoryReference, data=None, offset=0, allowPartial=False):
memoryReference, encodedData, offset=offset, allowPartial=allowPartial
)
if response["success"]:
- self.verify_invalidated_event(["all"])
+ invalidated_event = self.dap_server.wait_for_invalidated()
+ self.assertEqual(invalidated_event["body"].get("areas"), ["all"])
return response
diff --git a/lldb/source/Host/posix/HostThreadPosix.cpp b/lldb/source/Host/posix/HostThreadPosix.cpp
index a53a8cc9d8389..92f172ecd00a5 100644
--- a/lldb/source/Host/posix/HostThreadPosix.cpp
+++ b/lldb/source/Host/posix/HostThreadPosix.cpp
@@ -50,12 +50,8 @@ Status HostThreadPosix::Cancel() {
return error;
}
-Status HostThreadPosix::Detach() {
- Status error;
- if (IsJoinable()) {
- int err = ::pthread_detach(m_thread);
- error = Status(err, eErrorTypePOSIX);
- }
- Reset();
- return error;
+void HostThreadPosix::Reset() {
+ if (IsJoinable())
+ ::pthread_detach(m_thread);
+ HostNativeThreadBase::Reset();
}
diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
index 8e495e20d254a..6384b5e1bb57c 100644
--- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
+++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
@@ -1593,23 +1593,28 @@ DisassemblerLLVMC::DisassemblerLLVMC(const ArchSpec &arch,
}
if (triple.isRISCV() && !cpu_or_features_overriden) {
- uint32_t arch_flags = arch.GetFlags();
- if (arch_flags & ArchSpec::eRISCV_rvc)
- features_str += "+c,";
- if (arch_flags & ArchSpec::eRISCV_rve)
- features_str += "+e,";
- if ((arch_flags & ArchSpec::eRISCV_float_abi_single) ==
- ArchSpec::eRISCV_float_abi_single)
- features_str += "+f,";
- if ((arch_flags & ArchSpec::eRISCV_float_abi_double) ==
- ArchSpec::eRISCV_float_abi_double)
- features_str += "+f,+d,";
- if ((arch_flags & ArchSpec::eRISCV_float_abi_quad) ==
- ArchSpec::eRISCV_float_abi_quad)
- features_str += "+f,+d,+q,";
- // FIXME: how do we detect features such as `+a`, `+m`?
- // Turn them on by default now, since everyone seems to use them
- features_str += "+a,+m,";
+ auto subtarget_features = arch.GetSubtargetFeatures().getString();
+ if (!subtarget_features.empty()) {
+ features_str += subtarget_features;
+ } else {
+ uint32_t arch_flags = arch.GetFlags();
+ if (arch_flags & ArchSpec::eRISCV_rvc)
+ features_str += "+c,";
+ if (arch_flags & ArchSpec::eRISCV_rve)
+ features_str += "+e,";
+ if ((arch_flags & ArchSpec::eRISCV_float_abi_single) ==
+ ArchSpec::eRISCV_float_abi_single)
+ features_str += "+f,";
+ if ((arch_flags & ArchSpec::eRISCV_float_abi_double) ==
+ ArchSpec::eRISCV_float_abi_double)
+ features_str += "+f,+d,";
+ if ((arch_flags & ArchSpec::eRISCV_float_abi_quad) ==
+ ArchSpec::eRISCV_float_abi_quad)
+ features_str += "+f,+d,+q,";
+ // FIXME: how do we detect features such as `+a`, `+m`?
+ // Turn them on by default now, since everyone seems to use them
+ features_str += "+a,+m,";
+ }
}
// We use m_disasm_up.get() to tell whether we are valid or not, so if this
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 1a515852e7092..830ff4c1091fa 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -13,6 +13,7 @@
#include <optional>
#include <unordered_map>
+#include "lldb/Core/Debugger.h"
#include "lldb/Core/Module.h"
#include "lldb/Core/ModuleSpec.h"
#include "lldb/Core/PluginManager.h"
@@ -27,12 +28,14 @@
#include "lldb/Target/Target.h"
#include "lldb/Utility/ArchSpec.h"
#include "lldb/Utility/DataBufferHeap.h"
+#include "lldb/Utility/DataExtractor.h"
#include "lldb/Utility/FileSpecList.h"
#include "lldb/Utility/LLDBLog.h"
#include "lldb/Utility/Log.h"
#include "lldb/Utility/RangeMap.h"
#include "lldb/Utility/Status.h"
#include "lldb/Utility/Stream.h"
+#include "lldb/Utility/StreamString.h"
#include "lldb/Utility/Timer.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/PointerUnion.h"
@@ -45,6 +48,9 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/MipsABIFlags.h"
+#include "llvm/Support/RISCVAttributes.h"
+#include "llvm/TargetParser/RISCVISAInfo.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
#define CASE_AND_STREAM(s, def, width) \
case def: \
@@ -1407,6 +1413,178 @@ void ObjectFileELF::ParseARMAttributes(DataExtractor &data, uint64_t length,
}
}
+static std::optional<lldb::offset_t>
+FindSubSectionOffsetByName(const DataExtractor &data, lldb::offset_t offset,
+ uint32_t length, llvm::StringRef name) {
+ uint32_t section_length = 0;
+ llvm::StringRef section_name;
+ do {
+ offset += section_length;
+ // Sub-section's size and name are included in the total sub-section length.
+ // Don't shift the offset here, so it will point at the beginning of the
+ // sub-section and could be used as a return value.
+ auto tmp_offset = offset;
+ section_length = data.GetU32(&tmp_offset);
+ section_name = data.GetCStr(&tmp_offset);
+ } while (section_name != name && offset + section_length < length);
+
+ if (section_name == name)
+ return offset;
+
+ return std::nullopt;
+}
+
+static std::optional<lldb::offset_t>
+FindSubSubSectionOffsetByTag(const DataExtractor &data, lldb::offset_t offset,
+ unsigned tag) {
+ // Consume the sub-section size and name to advance the offset to the
+ // beginning of the sub-sub-section list.
+ auto parent_section_length = data.GetU32(&offset);
+ data.GetCStr(&offset);
+ auto parent_section_end_offset = offset + parent_section_length;
+
+ uint32_t section_length = 0;
+ unsigned section_tag = 0;
+ do {
+ offset += section_length;
+ // As with a sub-section, a sub-sub-section's tag and size are included in
+ // the total sub-sub-section length.
+ auto tmp_offset = offset;
+ section_tag = data.GetULEB128(&tmp_offset);
+ section_length = data.GetU32(&tmp_offset);
+ } while (section_tag != tag &&
+ offset + section_length < parent_section_end_offset);
+
+ if (section_tag == tag)
+ return offset;
+
+ return std::nullopt;
+}
+
+static std::optional<std::variant<uint64_t, llvm::StringRef>>
+GetAttributeValueByTag(const DataExtractor &data, lldb::offset_t offset,
+ unsigned tag) {
+ // Consume the sub-sub-section tag and size to advance the offset to the
+ // beginning of the attribute list.
+ data.GetULEB128(&offset);
+ auto parent_section_length = data.GetU32(&offset);
+ auto parent_section_end_offset = offset + parent_section_length;
+
+ std::variant<uint64_t, llvm::StringRef> result;
+ unsigned attribute_tag = 0;
+ do {
+ attribute_tag = data.GetULEB128(&offset);
+ // From the riscv psABI document:
+ // RISC-V attributes have a string value if the tag number is odd and an
+ // integer value if the tag number is even.
+ if (attribute_tag % 2)
+ result = data.GetCStr(&offset);
+ else
+ result = data.GetULEB128(&offset);
+ } while (attribute_tag != tag && offset < parent_section_end_offset);
+
+ if (attribute_tag == tag)
+ return result;
+
+ return std::nullopt;
+}
+
+void ObjectFileELF::ParseRISCVAttributes(DataExtractor &data, uint64_t length,
+ ArchSpec &arch_spec) {
+ Log *log = GetLog(LLDBLog::Modules);
+
+ lldb::offset_t offset = 0;
+
+ // According to the riscv psABI, the .riscv.attributes section has the
+ // following hierarchical structure:
+ //
+ // Section:
+ // .riscv.attributes {
+ // - (uint8_t) format
+ // - Sub-Section 1 {
+ // * (uint32_t) length
+ // * (c_str) name
+ // * Sub-Sub-Section 1.1 {
+ // > (uleb128_t) tag
+ // > (uint32_t) length
+ // > (uleb128_t) attribute_tag_1.1.1
+ // $ (c_str or uleb128_t) value
+ // > (uleb128_t) attribute_tag_1.1.2
+ // $ (c_str or uleb128_t) value
+ // ...
+ // Other attributes...
+ // ...
+ // > (uleb128_t) attribute_tag_1.1.N
+ // $ (c_str or uleb128_t) value
+ // }
+ // * Sub-Sub-Section 1.2 {
+ // ...
+ // Sub-Sub-Section structure...
+ // ...
+ // }
+ // ...
+ // Other sub-sub-sections...
+ // ...
+ // }
+ // - Sub-Section 2 {
+ // ...
+ // Sub-Section structure...
+ // ...
+ // }
+ // ...
+ // Other sub-sections...
+ // ...
+ // }
+
+ uint8_t format_version = data.GetU8(&offset);
+ if (format_version != llvm::ELFAttrs::Format_Version)
+ return;
+
+ auto subsection_or_opt =
+ FindSubSectionOffsetByName(data, offset, length, "riscv");
+ if (!subsection_or_opt) {
+ LLDB_LOGF(log,
+ "ObjectFileELF::%s Ill-formed .riscv.attributes section: "
+ "mandatory 'riscv' sub-section was not preserved",
+ __FUNCTION__);
+ return;
+ }
+
+ auto subsubsection_or_opt = FindSubSubSectionOffsetByTag(
+ data, *subsection_or_opt, llvm::ELFAttrs::File);
+ if (!subsubsection_or_opt)
+ return;
+
+ auto value_or_opt = GetAttributeValueByTag(data, *subsubsection_or_opt,
+ llvm::RISCVAttrs::ARCH);
+ if (!value_or_opt)
+ return;
+
+ auto normalized_isa_info = llvm::RISCVISAInfo::parseNormalizedArchString(
+ std::get<llvm::StringRef>(*value_or_opt));
+ if (llvm::errorToBool(normalized_isa_info.takeError()))
+ return;
+
+ llvm::SubtargetFeatures features;
+ features.addFeaturesVector((*normalized_isa_info)->toFeatures());
+ arch_spec.SetSubtargetFeatures(std::move(features));
+
+ // Additional verification of the arch string. This is primarily needed to
+ // warn users if the executable file contains conflicting RISC-V extensions
+ // that could lead to invalid disassembler output.
+ auto isa_info = llvm::RISCVISAInfo::parseArchString(
+ std::get<llvm::StringRef>(*value_or_opt),
+ /* EnableExperimentalExtension=*/true);
+ if (auto error = isa_info.takeError()) {
+ StreamString ss;
+ ss << "The .riscv.attributes section contains an invalid RISC-V arch "
+ "string: "
+ << llvm::toString(std::move(error))
+ << "\n\tThis could result in misleading disassembler output.\n";
+ Debugger::ReportWarning(ss.GetString().str());
+ }
+}
+
// GetSectionHeaderInfo
size_t ObjectFileELF::GetSectionHeaderInfo(SectionHeaderColl &section_headers,
DataExtractor &object_data,
@@ -1624,6 +1802,15 @@ size_t ObjectFileELF::GetSectionHeaderInfo(SectionHeaderColl &section_headers,
ParseARMAttributes(data, section_size, arch_spec);
}
+ if (arch_spec.GetTriple().isRISCV()) {
+ DataExtractor data;
+ if (sheader.sh_type == llvm::ELF::SHT_RISCV_ATTRIBUTES &&
+ section_size != 0 &&
+ data.SetData(object_data, sheader.sh_offset, section_size) ==
+ section_size)
+ ParseRISCVAttributes(data, section_size, arch_spec);
+ }
+
if (name == g_sect_name_gnu_debuglink) {
DataExtractor data;
if (section_size && (data.SetData(object_data, sheader.sh_offset,
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
index 9fc19bcd07f34..866ef270fa731 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
@@ -275,6 +275,10 @@ class ObjectFileELF : public lldb_private::ObjectFile {
uint64_t length,
lldb_private::ArchSpec &arch_spec);
+ static void ParseRISCVAttributes(lldb_private::DataExtractor &data,
+ uint64_t length,
+ lldb_private::ArchSpec &arch_spec);
+
/// Parses the elf section headers and returns the uuid, debug link name,
/// crc, archspec.
static size_t GetSectionHeaderInfo(SectionHeaderColl &section_headers,
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
index 3e085e993cad7..5e1aa33a59f89 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
@@ -199,125 +199,137 @@ PlatformDarwin::PutFile(const lldb_private::FileSpec &source,
return PlatformPOSIX::PutFile(source, destination, uid, gid);
}
-FileSpecList PlatformDarwin::LocateExecutableScriptingResources(
- Target *target, Module &module, Stream &feedback_stream) {
+static FileSpecList LocateExecutableScriptingResourcesFromDSYM(
+ Stream &feedback_stream, FileSpec module_spec, const Target &target,
+ const FileSpec &symfile_spec) {
FileSpecList file_list;
- if (target &&
- target->GetDebugger().GetScriptLanguage() == eScriptLanguagePython) {
- // NB some extensions might be meaningful and should not be stripped -
- // "this.binary.file"
- // should not lose ".file" but GetFileNameStrippingExtension() will do
- // precisely that. Ideally, we should have a per-platform list of
- // extensions (".exe", ".app", ".dSYM", ".framework") which should be
- // stripped while leaving "this.binary.file" as-is.
-
- FileSpec module_spec = module.GetFileSpec();
-
- if (module_spec) {
- if (SymbolFile *symfile = module.GetSymbolFile()) {
- ObjectFile *objfile = symfile->GetObjectFile();
- if (objfile) {
- FileSpec symfile_spec(objfile->GetFileSpec());
- if (symfile_spec &&
- llvm::StringRef(symfile_spec.GetPath())
- .contains_insensitive(".dSYM/Contents/Resources/DWARF") &&
- FileSystem::Instance().Exists(symfile_spec)) {
- while (module_spec.GetFilename()) {
- std::string module_basename(
- module_spec.GetFilename().GetCString());
- std::string original_module_basename(module_basename);
-
- bool was_keyword = false;
-
- // FIXME: for Python, we cannot allow certain characters in
- // module
- // filenames we import. Theoretically, different scripting
- // languages may have different sets of forbidden tokens in
- // filenames, and that should be dealt with by each
- // ScriptInterpreter. For now, we just replace dots with
- // underscores, but if we ever support anything other than
- // Python we will need to rework this
- llvm::replace(module_basename, '.', '_');
- llvm::replace(module_basename, ' ', '_');
- llvm::replace(module_basename, '-', '_');
- ScriptInterpreter *script_interpreter =
- target->GetDebugger().GetScriptInterpreter();
- if (script_interpreter &&
- script_interpreter->IsReservedWord(module_basename.c_str())) {
- module_basename.insert(module_basename.begin(), '_');
- was_keyword = true;
- }
+ while (module_spec.GetFilename()) {
+ std::string module_basename(module_spec.GetFilename().GetCString());
+ std::string original_module_basename(module_basename);
+
+ bool was_keyword = false;
+
+ // FIXME: for Python, don't allow certain characters in imported module
+ // filenames. Theoretically, different scripting languages may have
+ // different sets of forbidden tokens in filenames, and that should
+ // be dealt with by each ScriptInterpreter. For now, just replace dots
+ // with underscores. In order to support anything other than Python
+ // this will need to be reworked.
+ llvm::replace(module_basename, '.', '_');
+ llvm::replace(module_basename, ' ', '_');
+ llvm::replace(module_basename, '-', '_');
+ ScriptInterpreter *script_interpreter =
+ target.GetDebugger().GetScriptInterpreter();
+ if (script_interpreter &&
+ script_interpreter->IsReservedWord(module_basename.c_str())) {
+ module_basename.insert(module_basename.begin(), '_');
+ was_keyword = true;
+ }
- StreamString path_string;
- StreamString original_path_string;
- // for OSX we are going to be in
- // .dSYM/Contents/Resources/DWARF/<basename> let us go to
- // .dSYM/Contents/Resources/Python/<basename>.py and see if the
- // file exists
- path_string.Printf("%s/../Python/%s.py",
- symfile_spec.GetDirectory().GetCString(),
- module_basename.c_str());
- original_path_string.Printf(
- "%s/../Python/%s.py",
- symfile_spec.GetDirectory().GetCString(),
- original_module_basename.c_str());
- FileSpec script_fspec(path_string.GetString());
- FileSystem::Instance().Resolve(script_fspec);
- FileSpec orig_script_fspec(original_path_string.GetString());
- FileSystem::Instance().Resolve(orig_script_fspec);
-
- // if we did some replacements of reserved characters, and a
- // file with the untampered name exists, then warn the user
- // that the file as-is shall not be loaded
- if (module_basename != original_module_basename &&
- FileSystem::Instance().Exists(orig_script_fspec)) {
- const char *reason_for_complaint =
- was_keyword ? "conflicts with a keyword"
- : "contains reserved characters";
- if (FileSystem::Instance().Exists(script_fspec))
- feedback_stream.Printf(
- "warning: the symbol file '%s' contains a debug "
- "script. However, its name"
- " '%s' %s and as such cannot be loaded. LLDB will"
- " load '%s' instead. Consider removing the file with "
- "the malformed name to"
- " eliminate this warning.\n",
- symfile_spec.GetPath().c_str(),
- original_path_string.GetData(), reason_for_complaint,
- path_string.GetData());
- else
- feedback_stream.Printf(
- "warning: the symbol file '%s' contains a debug "
- "script. However, its name"
- " %s and as such cannot be loaded. If you intend"
- " to have this script loaded, please rename '%s' to "
- "'%s' and retry.\n",
- symfile_spec.GetPath().c_str(), reason_for_complaint,
- original_path_string.GetData(), path_string.GetData());
- }
+ StreamString path_string;
+ StreamString original_path_string;
+ // for OSX we are going to be in
+ // .dSYM/Contents/Resources/DWARF/<basename> let us go to
+ // .dSYM/Contents/Resources/Python/<basename>.py and see if the
+ // file exists
+ path_string.Printf("%s/../Python/%s.py",
+ symfile_spec.GetDirectory().GetCString(),
+ module_basename.c_str());
+ original_path_string.Printf("%s/../Python/%s.py",
+ symfile_spec.GetDirectory().GetCString(),
+ original_module_basename.c_str());
+ FileSpec script_fspec(path_string.GetString());
+ FileSystem::Instance().Resolve(script_fspec);
+ FileSpec orig_script_fspec(original_path_string.GetString());
+ FileSystem::Instance().Resolve(orig_script_fspec);
+
+ // if we did some replacements of reserved characters, and a
+ // file with the untampered name exists, then warn the user
+ // that the file as-is shall not be loaded
+ if (module_basename != original_module_basename &&
+ FileSystem::Instance().Exists(orig_script_fspec)) {
+ const char *reason_for_complaint = was_keyword
+ ? "conflicts with a keyword"
+ : "contains reserved characters";
+ if (FileSystem::Instance().Exists(script_fspec))
+ feedback_stream.Printf(
+ "warning: the symbol file '%s' contains a debug "
+ "script. However, its name"
+ " '%s' %s and as such cannot be loaded. LLDB will"
+ " load '%s' instead. Consider removing the file with "
+ "the malformed name to"
+ " eliminate this warning.\n",
+ symfile_spec.GetPath().c_str(), original_path_string.GetData(),
+ reason_for_complaint, path_string.GetData());
+ else
+ feedback_stream.Printf(
+ "warning: the symbol file '%s' contains a debug "
+ "script. However, its name"
+ " %s and as such cannot be loaded. If you intend"
+ " to have this script loaded, please rename '%s' to "
+ "'%s' and retry.\n",
+ symfile_spec.GetPath().c_str(), reason_for_complaint,
+ original_path_string.GetData(), path_string.GetData());
+ }
- if (FileSystem::Instance().Exists(script_fspec)) {
- file_list.Append(script_fspec);
- break;
- }
+ if (FileSystem::Instance().Exists(script_fspec)) {
+ file_list.Append(script_fspec);
+ break;
+ }
- // If we didn't find the python file, then keep stripping the
- // extensions and try again
- ConstString filename_no_extension(
- module_spec.GetFileNameStrippingExtension());
- if (module_spec.GetFilename() == filename_no_extension)
- break;
+ // If we didn't find the python file, then keep stripping the
+ // extensions and try again
+ ConstString filename_no_extension(
+ module_spec.GetFileNameStrippingExtension());
+ if (module_spec.GetFilename() == filename_no_extension)
+ break;
- module_spec.SetFilename(filename_no_extension);
- }
- }
- }
- }
- }
+ module_spec.SetFilename(filename_no_extension);
}
+
return file_list;
}
+FileSpecList PlatformDarwin::LocateExecutableScriptingResources(
+ Target *target, Module &module, Stream &feedback_stream) {
+ if (!target)
+ return {};
+
+ // For now only Python scripts supported for auto-loading.
+ if (target->GetDebugger().GetScriptLanguage() != eScriptLanguagePython)
+ return {};
+
+ // NB some extensions might be meaningful and should not be stripped -
+ // "this.binary.file"
+ // should not lose ".file" but GetFileNameStrippingExtension() will do
+ // precisely that. Ideally, we should have a per-platform list of
+ // extensions (".exe", ".app", ".dSYM", ".framework") which should be
+ // stripped while leaving "this.binary.file" as-is.
+
+ const FileSpec &module_spec = module.GetFileSpec();
+
+ if (!module_spec)
+ return {};
+
+ SymbolFile *symfile = module.GetSymbolFile();
+ if (!symfile)
+ return {};
+
+ ObjectFile *objfile = symfile->GetObjectFile();
+ if (!objfile)
+ return {};
+
+ const FileSpec &symfile_spec = objfile->GetFileSpec();
+ if (symfile_spec &&
+ llvm::StringRef(symfile_spec.GetPath())
+ .contains_insensitive(".dSYM/Contents/Resources/DWARF") &&
+ FileSystem::Instance().Exists(symfile_spec))
+ return LocateExecutableScriptingResourcesFromDSYM(
+ feedback_stream, module_spec, *target, symfile_spec);
+
+ return {};
+}
+
Status PlatformDarwin::ResolveSymbolFile(Target &target,
const ModuleSpec &sym_spec,
FileSpec &sym_file) {
diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
index 9a53252b2b4ae..8c1919eca7dda 100644
--- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
+++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
@@ -215,10 +215,6 @@ Status ProcessWindows::DoLaunch(Module *exe_module,
if (error.Success())
SetID(launch_info.GetProcessID());
m_pty = launch_info.TakePTY();
- // At this point, Process owns the ConPTY. If ProcessLaunchInfo still has a
- // reference to it, it might get closed prematurely if another target is
- // created.
- assert(m_pty.use_count() == 1 && "More than one reference to the ConPTY");
return error;
}
diff --git a/lldb/test/API/commands/frame/var/TestFrameVar.py b/lldb/test/API/commands/frame/var/TestFrameVar.py
index d8260a5657618..b70120cb2d8e1 100644
--- a/lldb/test/API/commands/frame/var/TestFrameVar.py
+++ b/lldb/test/API/commands/frame/var/TestFrameVar.py
@@ -16,6 +16,7 @@ class TestFrameVar(TestBase):
# set this to true. That way it won't be run once for
# each debug info format.
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def test_frame_var(self):
self.build()
diff --git a/lldb/test/API/commands/platform/connect/TestPlatformConnect.py b/lldb/test/API/commands/platform/connect/TestPlatformConnect.py
index 5df0c16fbd1f7..0f9a51e216215 100644
--- a/lldb/test/API/commands/platform/connect/TestPlatformConnect.py
+++ b/lldb/test/API/commands/platform/connect/TestPlatformConnect.py
@@ -8,6 +8,7 @@
class TestPlatformProcessConnect(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
@skipIfRemote
@expectedFailureAll(hostoslist=["windows"], triple=".*-android")
diff --git a/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py b/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
index 65f0cefae96b4..e9e6845c0f549 100644
--- a/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
+++ b/lldb/test/API/commands/platform/launchgdbserver/TestPlatformLaunchGDBServer.py
@@ -9,6 +9,7 @@
class TestPlatformProcessLaunchGDBServer(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def _launch_and_connect(self, exe):
hostname = socket.getaddrinfo("localhost", 0, proto=socket.IPPROTO_TCP)[0][4][0]
diff --git a/lldb/test/API/commands/process/launch/TestProcessLaunch.py b/lldb/test/API/commands/process/launch/TestProcessLaunch.py
index 92d0c468741e5..28de5bc0623d0 100644
--- a/lldb/test/API/commands/process/launch/TestProcessLaunch.py
+++ b/lldb/test/API/commands/process/launch/TestProcessLaunch.py
@@ -13,6 +13,7 @@
class ProcessLaunchTestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def setUp(self):
# Call super's setUp().
diff --git a/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py b/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py
index 8425ab09ab9d7..01f5e652d37bd 100644
--- a/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py
+++ b/lldb/test/API/commands/settings/use_source_cache/TestUseSourceCache.py
@@ -12,6 +12,7 @@
class SettingsUseSourceCacheTestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def test_set_use_source_cache_false(self):
"""Test that after 'set use-source-cache false', files are not locked."""
diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py
index c8527abf3c84e..a32b8feecc5cf 100644
--- a/lldb/test/API/commands/statistics/basic/TestStats.py
+++ b/lldb/test/API/commands/statistics/basic/TestStats.py
@@ -11,6 +11,7 @@
class TestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def test_enable_disable(self):
"""
diff --git a/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py b/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py
index 165fae72319ae..47bbd2439434c 100644
--- a/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py
+++ b/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py
@@ -13,6 +13,7 @@
class TestAutoInstallMainExecutable(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
@skipIfRemote
@skipIfWindows # This test is flaky on Windows
diff --git a/lldb/test/API/commands/target/dump-separate-debug-info/dwo/TestDumpDwo.py b/lldb/test/API/commands/target/dump-separate-debug-info/dwo/TestDumpDwo.py
index d819b5ed9ca87..7ac4e16e8cdcb 100644
--- a/lldb/test/API/commands/target/dump-separate-debug-info/dwo/TestDumpDwo.py
+++ b/lldb/test/API/commands/target/dump-separate-debug-info/dwo/TestDumpDwo.py
@@ -12,6 +12,7 @@
class TestDumpDWO(lldbtest.TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def get_dwos_from_json_output(self):
"""Returns a dictionary of `symfile` -> {`dwo_name` -> dwo_info object}."""
diff --git a/lldb/test/API/commands/trace/TestTraceStartStop.py b/lldb/test/API/commands/trace/TestTraceStartStop.py
index 9450f8b0961a8..8f882eb5d974b 100644
--- a/lldb/test/API/commands/trace/TestTraceStartStop.py
+++ b/lldb/test/API/commands/trace/TestTraceStartStop.py
@@ -7,6 +7,8 @@
@skipIfNoIntelPT
class TestTraceStartStop(TraceIntelPTTestCaseBase):
+ SHARED_BUILD_TESTCASE = False
+
def expectGenericHelpMessageForStartCommand(self):
self.expect(
"help thread trace start",
diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
index 605561c757372..e45362295690d 100644
--- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
+++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
@@ -14,6 +14,7 @@
class BreakpointCommandTestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
@expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528")
def test_breakpoint_command_sequence(self):
diff --git a/lldb/test/API/functionalities/breakpoint/comp_dir_symlink/TestCompDirSymLink.py b/lldb/test/API/functionalities/breakpoint/comp_dir_symlink/TestCompDirSymLink.py
index ca2c7c3d1ad93..1430eb890c2e6 100644
--- a/lldb/test/API/functionalities/breakpoint/comp_dir_symlink/TestCompDirSymLink.py
+++ b/lldb/test/API/functionalities/breakpoint/comp_dir_symlink/TestCompDirSymLink.py
@@ -16,6 +16,8 @@
class CompDirSymLinkTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def setUp(self):
# Call super's setUp().
TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py b/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py
index 29cf31563a9a9..7cecd1d290683 100644
--- a/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py
+++ b/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py
@@ -13,6 +13,8 @@
class TestObjCBreakpoints(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@add_test_categories(["objc"])
def test_break(self):
"""Test setting Objective-C specific breakpoints (DWARF in .o files)."""
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
index ca2d2d6b49541..2a420d8ed58ae 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py
@@ -10,6 +10,7 @@
class StdMapDataFormatterTestCase(TestBase):
TEST_WITH_PDB_DEBUG_INFO = True
+ SHARED_BUILD_TESTCASE = False
def setUp(self):
TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
index 00047e419de37..34989aea9de6e 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
@@ -12,6 +12,7 @@
class StdStringDataFormatterTestCase(TestBase):
TEST_WITH_PDB_DEBUG_INFO = True
+ SHARED_BUILD_TESTCASE = False
def setUp(self):
# Call super's setUp().
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestPty.py b/lldb/test/API/functionalities/gdb_remote_client/TestPty.py
index 94eeb6e3ba11a..47f687806dbab 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestPty.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestPty.py
@@ -7,6 +7,8 @@
@skipIf(hostoslist=["windows"])
class TestPty(GDBRemoteTestBase):
+ SHARED_BUILD_TESTCASE = False
+
server_socket_class = PtyServerSocket
def get_term_attrs(self):
diff --git a/lldb/test/API/functionalities/inferior-changed/TestInferiorChanged.py b/lldb/test/API/functionalities/inferior-changed/TestInferiorChanged.py
index ea0283e119f1e..c8114ab78d9e0 100644
--- a/lldb/test/API/functionalities/inferior-changed/TestInferiorChanged.py
+++ b/lldb/test/API/functionalities/inferior-changed/TestInferiorChanged.py
@@ -10,6 +10,8 @@
class ChangedInferiorTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipIf(hostoslist=["windows"])
@no_debug_info_test
def test_inferior_crashing(self):
diff --git a/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py b/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py
index 18371669462e2..668d31d8a5acd 100644
--- a/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py
+++ b/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py
@@ -10,6 +10,8 @@
class LimitDebugInfoTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def _check_type(self, target, name):
exe = target.FindModule(lldb.SBFileSpec("a.out"))
type_ = exe.FindFirstType(name)
diff --git a/lldb/test/API/functionalities/module_cache/bsd/TestModuleCacheBSD.py b/lldb/test/API/functionalities/module_cache/bsd/TestModuleCacheBSD.py
index 20312f829fdc1..180d487fd4845 100644
--- a/lldb/test/API/functionalities/module_cache/bsd/TestModuleCacheBSD.py
+++ b/lldb/test/API/functionalities/module_cache/bsd/TestModuleCacheBSD.py
@@ -10,6 +10,8 @@
class ModuleCacheTestcaseBSD(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def setUp(self):
# Call super's setUp().
TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py b/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py
index 501ceb705c579..7d7d31b366f4b 100644
--- a/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py
+++ b/lldb/test/API/functionalities/module_cache/debug_index/TestDebugIndexCache.py
@@ -9,6 +9,8 @@
class DebugIndexCacheTestcase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def setUp(self):
# Call super's setUp().
TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/rerun_and_expr/TestRerunAndExpr.py b/lldb/test/API/functionalities/rerun_and_expr/TestRerunAndExpr.py
index 1d62af4299c3b..e0d6a811fc22c 100644
--- a/lldb/test/API/functionalities/rerun_and_expr/TestRerunAndExpr.py
+++ b/lldb/test/API/functionalities/rerun_and_expr/TestRerunAndExpr.py
@@ -11,6 +11,8 @@
class TestRerunExpr(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
# FIXME: on Windows rebuilding the binary isn't enough to unload it
# on progrem restart. One will have to try hard to evict
# the module from the ModuleList (possibly including a call to
diff --git a/lldb/test/API/functionalities/rerun_and_expr_dylib/TestRerunAndExprDylib.py b/lldb/test/API/functionalities/rerun_and_expr_dylib/TestRerunAndExprDylib.py
index 19edaac964e62..cc9ffd280a021 100644
--- a/lldb/test/API/functionalities/rerun_and_expr_dylib/TestRerunAndExprDylib.py
+++ b/lldb/test/API/functionalities/rerun_and_expr_dylib/TestRerunAndExprDylib.py
@@ -26,6 +26,8 @@ def isUbuntu18_04():
class TestRerunExprDylib(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipTestIfFn(isUbuntu18_04, bugnumber="rdar://103831050")
@skipIfWindows
@skipIfRemote
diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py
index 08d78fb996c75..8181f27a0d669 100644
--- a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py
+++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py
@@ -6,6 +6,7 @@
class TestStepUntilAPI(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def setUp(self):
super().setUp()
diff --git a/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py b/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
index 9540dc066f308..a8cb87063f50a 100644
--- a/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
+++ b/lldb/test/API/lang/c/calling-conventions/TestCCallingConventions.py
@@ -7,6 +7,7 @@
class TestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def build_and_run(self, test_file):
"""
diff --git a/lldb/test/API/lang/c/shared_lib_stripped_symbols/TestSharedLibStrippedSymbols.py b/lldb/test/API/lang/c/shared_lib_stripped_symbols/TestSharedLibStrippedSymbols.py
index 5440778572f8d..173ebc769b922 100644
--- a/lldb/test/API/lang/c/shared_lib_stripped_symbols/TestSharedLibStrippedSymbols.py
+++ b/lldb/test/API/lang/c/shared_lib_stripped_symbols/TestSharedLibStrippedSymbols.py
@@ -9,6 +9,8 @@
class SharedLibStrippedTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@expectedFailureAll(oslist=["windows"])
# Sometimes fails with:
# error: Couldn't allocate space for materialized struct: Couldn't malloc: address space is full
diff --git a/lldb/test/API/lang/cpp/abi_tag_lookup/TestAbiTagLookup.py b/lldb/test/API/lang/cpp/abi_tag_lookup/TestAbiTagLookup.py
index 4f6e41ed29de1..19f4a4e14ed22 100644
--- a/lldb/test/API/lang/cpp/abi_tag_lookup/TestAbiTagLookup.py
+++ b/lldb/test/API/lang/cpp/abi_tag_lookup/TestAbiTagLookup.py
@@ -10,6 +10,8 @@
class AbiTagLookupTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipIfWindows
@expectedFailureAll(debug_info=["dwarf", "gmodules", "dwo"])
def test_abi_tag_lookup(self):
diff --git a/lldb/test/API/lang/cpp/abi_tag_structors/TestAbiTagStructors.py b/lldb/test/API/lang/cpp/abi_tag_structors/TestAbiTagStructors.py
index 2d3e4f7cdd472..58b726417ee0a 100644
--- a/lldb/test/API/lang/cpp/abi_tag_structors/TestAbiTagStructors.py
+++ b/lldb/test/API/lang/cpp/abi_tag_structors/TestAbiTagStructors.py
@@ -10,6 +10,8 @@
class AbiTagStructorsTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipIf(
compiler="clang",
compiler_version=["<", "22"],
diff --git a/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py b/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
index 9de7eb2e4a6e3..3f42dc195d118 100644
--- a/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
+++ b/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py
@@ -9,6 +9,8 @@
class TestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def test(self):
self.build()
lldbutil.run_to_source_breakpoint(
diff --git a/lldb/test/API/lang/cpp/expr-definition-in-dylib/TestExprDefinitionInDylib.py b/lldb/test/API/lang/cpp/expr-definition-in-dylib/TestExprDefinitionInDylib.py
index b3bed43c75873..b0781c8d442e5 100644
--- a/lldb/test/API/lang/cpp/expr-definition-in-dylib/TestExprDefinitionInDylib.py
+++ b/lldb/test/API/lang/cpp/expr-definition-in-dylib/TestExprDefinitionInDylib.py
@@ -5,6 +5,7 @@
class ExprDefinitionInDylibTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
@skipIf(
compiler="clang",
diff --git a/lldb/test/API/lang/cpp/gmodules/template-with-same-arg/TestTemplateWithSameArg.py b/lldb/test/API/lang/cpp/gmodules/template-with-same-arg/TestTemplateWithSameArg.py
index d40be55872eae..00fa739ee0591 100644
--- a/lldb/test/API/lang/cpp/gmodules/template-with-same-arg/TestTemplateWithSameArg.py
+++ b/lldb/test/API/lang/cpp/gmodules/template-with-same-arg/TestTemplateWithSameArg.py
@@ -27,6 +27,8 @@
class TestTemplateWithSameArg(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def setUp(self):
TestBase.setUp(self)
self.build()
diff --git a/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py b/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py
index 41141164769ec..73d43207cd12c 100644
--- a/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py
+++ b/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py
@@ -11,6 +11,8 @@
from lldbsuite.test import lldbplatformutil
class NamespaceLookupTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def setUp(self):
# Call super's setUp().
TestBase.setUp(self)
diff --git a/lldb/test/API/lang/cpp/template-alias/TestTemplateAlias.py b/lldb/test/API/lang/cpp/template-alias/TestTemplateAlias.py
index 8b6d6dcbc38ba..83a776fc1735d 100644
--- a/lldb/test/API/lang/cpp/template-alias/TestTemplateAlias.py
+++ b/lldb/test/API/lang/cpp/template-alias/TestTemplateAlias.py
@@ -5,6 +5,8 @@
class TestTemplateAlias(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def do_test(self, extra_flags):
self.build(dictionary=extra_flags)
self.main_source_file = lldb.SBFileSpec("main.cpp")
diff --git a/lldb/test/API/lang/cpp/template-function/TestTemplateFunctions.py b/lldb/test/API/lang/cpp/template-function/TestTemplateFunctions.py
index 3be93dedfd11d..aac9b0a2450cc 100644
--- a/lldb/test/API/lang/cpp/template-function/TestTemplateFunctions.py
+++ b/lldb/test/API/lang/cpp/template-function/TestTemplateFunctions.py
@@ -8,6 +8,8 @@
class TemplateFunctionsTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def do_test_template_function(self, add_cast):
self.build()
lldbutil.run_to_source_breakpoint(
diff --git a/lldb/test/API/lang/cpp/unique-types3/TestUniqueTypes3.py b/lldb/test/API/lang/cpp/unique-types3/TestUniqueTypes3.py
index f72701b5eee07..9f41bbeee0636 100644
--- a/lldb/test/API/lang/cpp/unique-types3/TestUniqueTypes3.py
+++ b/lldb/test/API/lang/cpp/unique-types3/TestUniqueTypes3.py
@@ -9,6 +9,8 @@
class UniqueTypesTestCase3(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def do_test(self, debug_flags):
"""Test that we display the correct template instantiation."""
self.build(dictionary=debug_flags)
diff --git a/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py b/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py
index 480d99523e8a2..921db09db1c57 100644
--- a/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py
+++ b/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py
@@ -8,6 +8,8 @@
class TestObjCStructArgument(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
def setUp(self):
# Call super's setUp().
TestBase.setUp(self)
diff --git a/lldb/test/API/lldbutil-tests/failed-to-hit-breakpoint/TestLLDBUtilFailedToHitBreakpoint.py b/lldb/test/API/lldbutil-tests/failed-to-hit-breakpoint/TestLLDBUtilFailedToHitBreakpoint.py
index 632645ac7c7df..60a94e9c25b34 100644
--- a/lldb/test/API/lldbutil-tests/failed-to-hit-breakpoint/TestLLDBUtilFailedToHitBreakpoint.py
+++ b/lldb/test/API/lldbutil-tests/failed-to-hit-breakpoint/TestLLDBUtilFailedToHitBreakpoint.py
@@ -11,7 +11,6 @@
class LLDBUtilFailedToHitBreakpointTest(TestBase):
NO_DEBUG_INFO_TESTCASE = True
- @expectedFailureAll(oslist=["windows"])
def test_error_message(self):
"""
Tests that run_to_source_breakpoint prints the right error message
diff --git a/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py b/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py
index 9309de4824ec4..c941d7a61da05 100644
--- a/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py
+++ b/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py
@@ -12,6 +12,8 @@
class TestFirmwareCorefiles(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipIf(
debug_info=no_match(["dsym"]),
bugnumber="This test is looking explicitly for a dSYM",
diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
index b17ee83ea04fe..b6f6368f6da80 100644
--- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
+++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
@@ -7,6 +7,7 @@
class TestSimulatorPlatformLaunching(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def check_load_commands(self, expected_load_command):
"""sanity check the built binary for the expected number of load commands"""
diff --git a/lldb/test/API/macosx/skinny-corefile/TestSkinnyCorefile.py b/lldb/test/API/macosx/skinny-corefile/TestSkinnyCorefile.py
index bc19c69df7620..66a3cba83ff45 100644
--- a/lldb/test/API/macosx/skinny-corefile/TestSkinnyCorefile.py
+++ b/lldb/test/API/macosx/skinny-corefile/TestSkinnyCorefile.py
@@ -12,6 +12,8 @@
class TestSkinnyCorefile(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipIf(
debug_info=no_match(["dsym"]),
bugnumber="This test is looking explicitly for a dSYM",
diff --git a/lldb/test/API/python_api/debugger/TestDebuggerAPI.py b/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
index 600cde3c6b807..93ee52dc88ef8 100644
--- a/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
+++ b/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
@@ -11,6 +11,7 @@
class DebuggerAPITestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def test_debugger_api_boundary_condition(self):
"""Exercise SBDebugger APIs with boundary conditions."""
diff --git a/lldb/test/API/python_api/target-arch-from-module/TestTargetArchFromModule.py b/lldb/test/API/python_api/target-arch-from-module/TestTargetArchFromModule.py
index 0141828ae1eab..ba9ab286f82e6 100644
--- a/lldb/test/API/python_api/target-arch-from-module/TestTargetArchFromModule.py
+++ b/lldb/test/API/python_api/target-arch-from-module/TestTargetArchFromModule.py
@@ -12,6 +12,8 @@
class TargetArchFromModule(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipIf(
debug_info=no_match(["dsym"]),
bugnumber="This test is looking explicitly for a dSYM",
diff --git a/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py b/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py
index 93b23d0ba81cb..a559307e59930 100644
--- a/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py
+++ b/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py
@@ -13,6 +13,8 @@
class ModuleUnifiedSectionList(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipUnlessPlatform(["linux", "freebsd", "netbsd"])
def test_unified_section_list(self):
self.build()
diff --git a/lldb/test/API/riscv/disassembler/TestDisassembler.py b/lldb/test/API/riscv/disassembler/TestDisassembler.py
new file mode 100644
index 0000000000000..2f01283786b1f
--- /dev/null
+++ b/lldb/test/API/riscv/disassembler/TestDisassembler.py
@@ -0,0 +1,87 @@
+"""
+Tests that LLDB can correctly set up a disassembler using extensions from the .riscv.attributes section.
+"""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+import os
+
+
+class TestDisassembler(TestBase):
+    """Disassembler tests that run against ELF images generated from YAML."""
+
+    # Mnemonics of the Zbb instructions the tests look for in the disassembly.
+    expected_zbb_instrs = ["andn", "orn", "xnor", "rol", "ror"]
+
+    def create_target_from_yaml(self, name):
+        """
+        Converts <name>.yaml from the test's source directory into the build
+        artifact <name> and returns a valid target created from that artifact.
+        """
+        yaml = os.path.join(self.getSourceDir(), name + ".yaml")
+        exe = self.getBuildArtifact(name)
+        self.yaml2obj(yaml, exe)
+
+        target = self.dbg.CreateTarget(exe)
+        self.assertTrue(target.IsValid(), VALID_TARGET)
+        return target
+
+    def test_without_riscv_attributes(self):
+        """
+        Tests disassembly of a riscv binary without the .riscv.attributes.
+        Without the .riscv.attributes section lldb won't set up a disassembler to
+        handle the bitmanip extension, so it is not expected to see zbb instructions
+        in the output.
+        """
+        self.create_target_from_yaml("stripped.out")
+
+        self.expect("disassemble --name do_zbb_stuff")
+        output = self.res.GetOutput()
+
+        for instr in self.expected_zbb_instrs:
+            self.assertNotIn(
+                instr, output, "Zbb instructions should not be disassembled"
+            )
+
+        self.assertEqual(
+            output.count("unknown"),
+            len(self.expected_zbb_instrs),
+            "Instructions from the Zbb extension should be displayed as <unknown>",
+        )
+
+    def test_with_riscv_attributes(self):
+        """
+        Tests disassembly of a riscv binary with the .riscv.attributes.
+        """
+        self.create_target_from_yaml("a.out")
+
+        self.expect("disassemble --name do_zbb_stuff")
+        output = self.res.GetOutput()
+
+        for instr in self.expected_zbb_instrs:
+            self.assertIn(instr, output, "Invalid disassembler output")
+
+    def test_conflicting_extensions(self):
+        """
+        This test demonstrates the scenario where:
+        1. file_with_zcd.c is compiled with rv64gc (includes C and D).
+        2. file_with_zcmp.c is compiled with rv64imad_zcmp (includes Zcmp).
+        3. The linker merges .riscv.attributes, creating the union: C + D + Zcmp.
+
+        The Zcmp extension is incompatible with the C extension when the D extension is enabled.
+        Therefore, the arch string contains conflicting extensions, and LLDB should
+        display an appropriate warning in this case.
+        """
+        self.create_target_from_yaml("conflicting.out")
+
+        # NOTE(review): assumes the warning is attached to the result of a command
+        # that sets up the disassembler - confirm where LLDB actually reports it.
+        self.runCmd("disassemble --name main", check=False)
+        output = self.res.GetOutput() + self.res.GetError()
+
+        self.assertIn(
+            "The .riscv.attributes section contains an invalid RISC-V arch string",
+            output,
+        )
diff --git a/lldb/test/API/riscv/disassembler/a.out.yaml b/lldb/test/API/riscv/disassembler/a.out.yaml
new file mode 100644
index 0000000000000..5823ded9606c8
--- /dev/null
+++ b/lldb/test/API/riscv/disassembler/a.out.yaml
@@ -0,0 +1,32 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_RISCV
+ Flags: [ EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_DOUBLE ]
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x550
+ AddressAlign: 0x4
+ Content: EF002002AA87172500000335A5A782653000137101FF814601470A88EFF05FFD029097210000938161298280010017250000130525A9972700009387A7A86388A7009727000083B7E7A491C38287828017250000130505A797250000938585A6898D93D73540FD91BE95858599C59727000083B7A7A291C3828782809727000083C747A49DE7411106E49727000083B7E79F91C717250000033545A28297EFF01FF9A260854717270000230DF7A041018280828071BF411106E422E000083376B5403366B5403346B5403316B5603356B560A260026441018280011106EC22E8001001452330A4FE2326A4FEEFF0BFFC033504FEE260426405618280
+ - Name: .riscv.attributes
+ Type: SHT_RISCV_ATTRIBUTES
+ AddressAlign: 0x1
+ Content: 416C000000726973637600016200000004100572763634693270315F6D3270305F613270315F663270325F643270325F633270305F7A696373723270305F7A6966656E6365693270305F7A6D6D756C3170305F7A61616D6F3170305F7A616C7273633170305F7A626231703000
+Symbols:
+ - Name: _Z12do_zbb_stuffv
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x606
+ Size: 0x24
+ - Name: main
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x62A
+ Size: 0x22
+...
diff --git a/lldb/test/API/riscv/disassembler/conflicting.out.yaml b/lldb/test/API/riscv/disassembler/conflicting.out.yaml
new file mode 100644
index 0000000000000..2e0a155f3d0ce
--- /dev/null
+++ b/lldb/test/API/riscv/disassembler/conflicting.out.yaml
@@ -0,0 +1,38 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_RISCV
+ Flags: [ EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_DOUBLE ]
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x550
+ AddressAlign: 0x4
+ Content: EF002002AA87172500000335A5A782653000137101FF814601470A88EFF05FFD029097210000938161298280010017250000130525A9972700009387A7A86388A7009727000083B7E7A491C38287828017250000130505A797250000938585A6898D93D73540FD91BE95858599C59727000083B7A7A291C3828782809727000083C747A49DE7411106E49727000083B7E79F91C717250000033545A28297EFF01FF9A260854717270000230DF7A041018280828071BF011106EC22E8001001452330A4FE2326A4FEEF004001EF008002033504FEE260426405618280411106E422E0000802A006A42AA82EACA260026441018280411106E422E0000872B866AC26AC72BEA260026441018280
+ - Name: .riscv.attributes
+ Type: SHT_RISCV_ATTRIBUTES
+ AddressAlign: 0x1
+ Content: 4174000000726973637600016A00000004100572763634693270315F6D3270305F613270315F663270325F643270325F633270305F7A696373723270305F7A6966656E6365693270305F7A6D6D756C3170305F7A61616D6F3170305F7A616C7273633170305F7A63613170305F7A636D7031703000
+Symbols:
+ - Name: main
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x606
+ Size: 0x26
+ - Name: function_with_zcd_instructions
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x62C
+ Size: 0x18
+ - Name: function_with_zcmp_extension
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x644
+ Size: 0x18
+...
diff --git a/lldb/test/API/riscv/disassembler/stripped.out.yaml b/lldb/test/API/riscv/disassembler/stripped.out.yaml
new file mode 100644
index 0000000000000..7c94fa577abc7
--- /dev/null
+++ b/lldb/test/API/riscv/disassembler/stripped.out.yaml
@@ -0,0 +1,28 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_RISCV
+ Flags: [ EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_DOUBLE ]
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x550
+ AddressAlign: 0x4
+ Content: EF002002AA87172500000335A5A782653000137101FF814601470A88EFF05FFD029097210000938161298280010017250000130525A9972700009387A7A86388A7009727000083B7E7A491C38287828017250000130505A797250000938585A6898D93D73540FD91BE95858599C59727000083B7A7A291C3828782809727000083C747A49DE7411106E49727000083B7E79F91C717250000033545A28297EFF01FF9A260854717270000230DF7A041018280828071BF411106E422E000083376B5403366B5403346B5403316B5603356B560A260026441018280011106EC22E8001001452330A4FE2326A4FEEFF0BFFC033504FEE260426405618280
+Symbols:
+ - Name: _Z12do_zbb_stuffv
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x606
+ Size: 0x24
+ - Name: main
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x62A
+ Size: 0x22
+...
diff --git a/lldb/test/API/source-manager/TestSourceManager.py b/lldb/test/API/source-manager/TestSourceManager.py
index 3500dded815b9..9055dcba93fd1 100644
--- a/lldb/test/API/source-manager/TestSourceManager.py
+++ b/lldb/test/API/source-manager/TestSourceManager.py
@@ -30,6 +30,7 @@ def ansi_color_surround_regex(inner_regex_text):
class SourceManagerTestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ SHARED_BUILD_TESTCASE = False
def setUp(self):
# Call super's setUp().
diff --git a/lldb/test/API/test_utils/base/TestBaseTest.py b/lldb/test/API/test_utils/base/TestBaseTest.py
index 41ba481b9b74f..afff48d0c6d13 100644
--- a/lldb/test/API/test_utils/base/TestBaseTest.py
+++ b/lldb/test/API/test_utils/base/TestBaseTest.py
@@ -9,6 +9,8 @@
class TestBuildMethod(Base):
+ SHARED_BUILD_TESTCASE = False
+
def setUp(self):
super().setUp()
self._traces = []
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
index 3309800c1dd10..7f9d4325ce4f8 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
@@ -12,6 +12,8 @@
class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
+ SHARED_BUILD_TESTCASE = False
+
def setUp(self):
lldbdap_testcase.DAPTestCaseBase.setUp(self)
diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
index 19f88d88c2ff4..3a4bc62fc6872 100644
--- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
+++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
@@ -12,6 +12,8 @@
class TestDAP_disconnect(lldbdap_testcase.DAPTestCaseBase):
+ SHARED_BUILD_TESTCASE = False
+
source = "main.cpp"
def disconnect_and_assert_no_output_printed(self):
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_args.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_args.py
index 221147f535958..6774f0516ae79 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_args.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_args.py
@@ -11,9 +11,6 @@ class TestDAP_launch_args(lldbdap_testcase.DAPTestCaseBase):
Tests launch of a simple program with arguments
"""
- @expectedFailureWindows(
- bugnumber="https://github.com/llvm/llvm-project/issues/137599"
- )
def test(self):
program = self.getBuildArtifact("a.out")
args = ["one", "with space", "'with single quotes'", '"with double quotes"']
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_basic.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_basic.py
index d0e8a792e4e25..93ae5d05e9d6c 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_basic.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_basic.py
@@ -12,9 +12,6 @@ class TestDAP_launch_basic(lldbdap_testcase.DAPTestCaseBase):
environment, or anything else is specified.
"""
- @expectedFailureWindows(
- bugnumber="https://github.com/llvm/llvm-project/issues/137599"
- )
def test(self):
program = self.getBuildArtifact("a.out")
self.build_and_launch(program)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_debuggerRoot.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_debuggerRoot.py
index 39397120bf5d8..deeab23d3ec56 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_debuggerRoot.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_debuggerRoot.py
@@ -14,9 +14,6 @@ class TestDAP_launch_debuggerRoot(lldbdap_testcase.DAPTestCaseBase):
the lldb-dap debug adapter.
"""
- @expectedFailureWindows(
- bugnumber="https://github.com/llvm/llvm-project/issues/137599"
- )
def test(self):
program = self.getBuildArtifact("a.out")
program_parent_dir = os.path.realpath(os.path.dirname(os.path.dirname(program)))
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_environment_with_object.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_environment_with_object.py
index f84aff742eed2..8c7994eac7926 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_environment_with_object.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_environment_with_object.py
@@ -7,9 +7,6 @@
class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
- @expectedFailureWindows(
- bugnumber="https://github.com/llvm/llvm-project/issues/137599"
- )
def test_environment_with_object(self):
"""
Tests launch of a simple program with environment variables
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_disabled.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_disabled.py
index b71bfc8152b69..d7b8579845956 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_disabled.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_disabled.py
@@ -13,9 +13,6 @@ class TestDAP_launch_shellExpandArguments_disabled(lldbdap_testcase.DAPTestCaseB
disabled.
"""
- @expectedFailureWindows(
- bugnumber="https://github.com/llvm/llvm-project/issues/137599"
- )
def test(self):
program = self.getBuildArtifact("a.out")
program_dir = os.path.dirname(program)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_enabled.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_enabled.py
index 443bb6f6fee54..7ddde219fc88d 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_enabled.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_shellExpandArguments_enabled.py
@@ -18,10 +18,9 @@ class TestDAP_launch_shellExpandArguments_enabled(lldbdap_testcase.DAPTestCaseBa
"""
@skipIfLinux # shell argument expansion doesn't seem to work on Linux
- @expectedFailureWindows(
- bugnumber="https://github.com/llvm/llvm-project/issues/137599"
+ @expectedFailureAll(
+ oslist=["freebsd", "netbsd", "windows"], bugnumber="llvm.org/pr48349"
)
- @expectedFailureAll(oslist=["freebsd", "netbsd"], bugnumber="llvm.org/pr48349")
def test(self):
program = self.getBuildArtifact("a.out")
program_dir = os.path.dirname(program)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_stdio_redirection_and_console.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_stdio_redirection_and_console.py
index 0ed8a5e11bf8b..bec76fb4ef5e1 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_stdio_redirection_and_console.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_stdio_redirection_and_console.py
@@ -4,10 +4,10 @@
from lldbsuite.test.decorators import (
skipIfAsan,
- expectedFailureWindows,
skipIf,
skipIfBuildType,
no_match,
+ skipIfWindows,
)
import lldbdap_testcase
import tempfile
@@ -19,9 +19,7 @@ class TestDAP_launch_stdio_redirection_and_console(lldbdap_testcase.DAPTestCaseB
"""
@skipIfAsan
- @expectedFailureWindows(
- bugnumber="https://github.com/llvm/llvm-project/issues/137599"
- )
+ @skipIfWindows # https://github.com/llvm/llvm-project/issues/62336
@skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
@skipIfBuildType(["debug"])
def test(self):
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_version.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_version.py
index a598db41595a5..fca153044da82 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_version.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch_version.py
@@ -13,9 +13,6 @@ class TestDAP_launch_version(lldbdap_testcase.DAPTestCaseBase):
as the one returned by "version" command.
"""
- @expectedFailureWindows(
- bugnumber="https://github.com/llvm/llvm-project/issues/137599"
- )
def test(self):
program = self.getBuildArtifact("a.out")
self.build_and_launch(program)
diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
index dfb4906ae6a49..0fdc719b6cb76 100644
--- a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
@@ -12,6 +12,8 @@
@skipIfBuildType(["debug"])
class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
+ SHARED_BUILD_TESTCASE = False
+
def read_pid_message(self, fifo_file):
with open(fifo_file, "r") as file:
self.assertIn("pid", file.readline())
diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
index 10c67a94407e6..a70fefd358b4b 100644
--- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
+++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
@@ -17,6 +17,8 @@ def make_buffer_verify_dict(start_idx, count, offset=0):
class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
+ SHARED_BUILD_TESTCASE = False
+
def verify_values(self, verify_dict, actual, varref_dict=None, expression=None):
if "equals" in verify_dict:
verify = verify_dict["equals"]
@@ -306,6 +308,16 @@ def do_test_scopes_variables_setVariable_evaluate(
argv, 0x1234, "verify argv was set to 0x1234 (0x1234 != %#x)" % (argv)
)
+ # Test hexadecimal format
+ response = self.set_local("argc", 42, is_hex=True)
+ verify_response = {
+ "type": "int",
+ "value": "0x0000002a",
+ }
+ for key, value in verify_response.items():
+ self.assertEqual(value, response["body"][key])
+ self.set_local("argc", 123)
+
# Set a variable value whose name is synthetic, like a variable index
# and verify the value by reading it
variable_value = 100
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py b/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py
index 76b0b204123dd..4bfd816ae2cdb 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py
@@ -43,6 +43,8 @@ def uint32_trunc(x):
class TestGdbRemotePlatformFile(GdbRemoteTestCaseBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipIfWindows
@add_test_categories(["llgs"])
def test_platform_file_rdonly(self):
diff --git a/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py b/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py
index ed600d396fad4..12946d9d42d11 100644
--- a/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py
+++ b/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py
@@ -9,6 +9,8 @@
class TestGdbRemoteConnection(gdbremote_testcase.GdbRemoteTestCaseBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipIfRemote # reverse connect is not a supported use case for now
def test_reverse_connect(self):
# Reverse connect is the default connection method.
diff --git a/lldb/test/API/types/AbstractBase.py b/lldb/test/API/types/AbstractBase.py
index fb1e25254b281..0420ffd8f3bbb 100644
--- a/lldb/test/API/types/AbstractBase.py
+++ b/lldb/test/API/types/AbstractBase.py
@@ -22,6 +22,8 @@ class GenericTester(TestBase):
# printf() stmts (see basic_type.cpp).
pattern = re.compile(r" (\*?a[^=]*) = '([^=]*)'$")
+ SHARED_BUILD_TESTCASE = False
+
# Assert message.
DATA_TYPE_GROKKED = "Data type from expr parser output is parsed correctly"
diff --git a/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp
index fb02d0ada651e..725d5de094c95 100644
--- a/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp
@@ -54,8 +54,9 @@ SetVariableRequestHandler::Run(const SetVariableArguments &args) const {
if (!success)
return llvm::make_error<DAPError>(error.GetCString());
+ const bool hex = args.format ? args.format->hex : false;
VariableDescription desc(variable,
- dap.configuration.enableAutoVariableSummaries);
+ dap.configuration.enableAutoVariableSummaries, hex);
SetVariableResponseBody body;
body.value = desc.display_value;
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index 9bf04757294d6..a8280bcdd9ee6 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -444,7 +444,7 @@ struct SetVariableArguments {
std::string value;
/// Specifies details on how to format the response value.
- ValueFormat format;
+ std::optional<ValueFormat> format;
};
bool fromJSON(const llvm::json::Value &, SetVariableArguments &,
llvm::json::Path);
diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp
index 18ba5cbf58cfd..5d6086b46add0 100644
--- a/lldb/unittests/DAP/ProtocolRequestsTest.cpp
+++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp
@@ -11,6 +11,7 @@
#include "TestingSupport/TestUtilities.h"
#include "llvm/Testing/Support/Error.h"
#include <gtest/gtest.h>
+#include <optional>
using namespace llvm;
using namespace lldb_dap::protocol;
@@ -413,3 +414,22 @@ TEST(ProtocolRequestsTest, StackTraceResponseBody) {
ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body));
}
+
+TEST(ProtocolRequestsTest, SetVariableArguments) {
+ llvm::Expected<SetVariableArguments> expected =
+ parse<SetVariableArguments>(R"({
+ "variablesReference": 42,
+ "name": "test",
+ "value": "12345"
+ })");
+ ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+ EXPECT_EQ(expected->variablesReference.AsUInt32(), 42U);
+ EXPECT_EQ(expected->name, "test");
+ EXPECT_EQ(expected->value, "12345");
+ EXPECT_EQ(expected->format, std::nullopt);
+
+ // Check required keys: an empty object must fail at variablesReference.
+ EXPECT_THAT_EXPECTED(
+ parse<SetVariableArguments>(R"({})"),
+ FailedWithMessage("missing value at (root).variablesReference"));
+}
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index d938214f9d0df..81aaf6034cca7 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1004,6 +1004,13 @@ macro(generate_llvm_objects name)
if (ARG_GENERATE_DRIVER)
string(REPLACE "-" "_" TOOL_NAME ${name})
+
+ set(INITLLVM_ARGS "")
+
+ if(${name} STREQUAL "clang")
+ set(INITLLVM_ARGS ", /*InstallPipeSignalExitHandler=*/true, /*NeedsPOSIXUtilitySignalHandling=*/true")
+ endif()
+
foreach(path ${CMAKE_MODULE_PATH})
if(EXISTS ${path}/llvm-driver-template.cpp.in)
configure_file(
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 8b7c8cd4028bd..b5372eea259f9 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -733,7 +733,6 @@ if (MSVC)
-wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
-wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data'
-wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception'
- -wd4351 # Suppress 'new behavior: elements of array 'array' will be default initialized'
-wd4456 # Suppress 'declaration of 'var' hides local variable'
-wd4457 # Suppress 'declaration of 'var' hides function parameter'
-wd4458 # Suppress 'declaration of 'var' hides class member'
diff --git a/llvm/cmake/modules/llvm-driver-template.cpp.in b/llvm/cmake/modules/llvm-driver-template.cpp.in
index 1470ef1f06164..d4c385c8cf412 100644
--- a/llvm/cmake/modules/llvm-driver-template.cpp.in
+++ b/llvm/cmake/modules/llvm-driver-template.cpp.in
@@ -13,6 +13,6 @@
int @TOOL_NAME@_main(int argc, char **, const llvm::ToolContext &);
int main(int argc, char **argv) {
- llvm::InitLLVM X(argc, argv);
+ llvm::InitLLVM X(argc, argv@INITLLVM_ARGS@);
return @TOOL_NAME@_main(argc, argv, {argv[0], nullptr, false});
}
diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst
index f7df57d05baa0..a2ee28151b354 100644
--- a/llvm/docs/MIRLangRef.rst
+++ b/llvm/docs/MIRLangRef.rst
@@ -523,7 +523,7 @@ The full syntax of a register operand is shown below:
.. code-block:: text
- [<flags>] <register> [ :<subregister-idx-name> ] [ (tied-def <tied-op>) ]
+ [<flags>] <register> [ .<subregister-idx-name> ] [ :<register-class> ] [ (tied-def <tied-op>) ] [ (<type>) ]
This example shows an instance of the X86 ``XOR32rr`` instruction that has
5 register operands with different register flags:
@@ -532,6 +532,9 @@ This example shows an instance of the X86 ``XOR32rr`` instruction that has
dead $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags, implicit-def $al
+Note that subregister-index, register-class and type cannot be specified for
+physical registers. Additionally, tied-def can only be specified for a use.
+
.. _register-flags:
Register Flags
@@ -602,7 +605,7 @@ lower bits from the 32-bit virtual register 0 to the 8-bit virtual register 1:
.. code-block:: text
- %1 = COPY %0:sub_8bit
+ %1 = COPY %0.sub_8bit
The names of the subregister indices are target specific, and are typically
defined in the target's ``*RegisterInfo.td`` file.
diff --git a/llvm/include/llvm/Support/CrashRecoveryContext.h b/llvm/include/llvm/Support/CrashRecoveryContext.h
index ffee81dd24587..4abd2d0eedcde 100644
--- a/llvm/include/llvm/Support/CrashRecoveryContext.h
+++ b/llvm/include/llvm/Support/CrashRecoveryContext.h
@@ -60,7 +60,7 @@ class CrashRecoveryContext {
LLVM_ABI void unregisterCleanup(CrashRecoveryContextCleanup *cleanup);
/// Enable crash recovery.
- LLVM_ABI static void Enable();
+ LLVM_ABI static void Enable(bool NeedsPOSIXUtilitySignalHandling = false);
/// Disable crash recovery.
LLVM_ABI static void Disable();
diff --git a/llvm/include/llvm/Support/InitLLVM.h b/llvm/include/llvm/Support/InitLLVM.h
index 748f5d8aa6aea..4d513bfd576bb 100644
--- a/llvm/include/llvm/Support/InitLLVM.h
+++ b/llvm/include/llvm/Support/InitLLVM.h
@@ -36,10 +36,13 @@ namespace llvm {
class InitLLVM {
public:
LLVM_ABI InitLLVM(int &Argc, const char **&Argv,
- bool InstallPipeSignalExitHandler = true);
- InitLLVM(int &Argc, char **&Argv, bool InstallPipeSignalExitHandler = true)
+ bool InstallPipeSignalExitHandler = true,
+ bool NeedsPOSIXUtilitySignalHandling = false);
+ InitLLVM(int &Argc, char **&Argv, bool InstallPipeSignalExitHandler = true,
+ bool NeedsPOSIXUtilitySignalHandling = false)
: InitLLVM(Argc, const_cast<const char **&>(Argv),
- InstallPipeSignalExitHandler) {}
+ InstallPipeSignalExitHandler,
+ NeedsPOSIXUtilitySignalHandling) {}
LLVM_ABI ~InitLLVM();
diff --git a/llvm/include/llvm/Support/Signals.h b/llvm/include/llvm/Support/Signals.h
index 21b425fffef53..d9bff20b85393 100644
--- a/llvm/include/llvm/Support/Signals.h
+++ b/llvm/include/llvm/Support/Signals.h
@@ -99,8 +99,12 @@ using SignalHandlerCallback = void (*)(void *);
/// Add a function to be called when an abort/kill signal is delivered to the
/// process. The handler can have a cookie passed to it to identify what
-/// instance of the handler it is.
-LLVM_ABI void AddSignalHandler(SignalHandlerCallback FnPtr, void *Cookie);
+/// instance of the handler it is. The NeedsPOSIXUtilitySignalHandling
+/// argument indicates whether POSIX signal handling semantics are followed,
+/// so that the signal handler resignals itself to terminate after handling
+/// the signal.
+LLVM_ABI void AddSignalHandler(SignalHandlerCallback FnPtr, void *Cookie,
+ bool NeedsPOSIXUtilitySignalHandling = false);
/// This function registers a function to be called when the user "interrupts"
/// the program (typically by pressing ctrl-c). When the user interrupts the
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index b0ac14ba8b393..738d0c063a5e4 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -329,8 +329,7 @@ bool llvm::IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
// Look through ptr->int and ptr->ptr casts.
if (CE->getOpcode() == Instruction::PtrToInt ||
- CE->getOpcode() == Instruction::PtrToAddr ||
- CE->getOpcode() == Instruction::BitCast)
+ CE->getOpcode() == Instruction::PtrToAddr)
return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL,
DSOEquiv);
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 32b936aa45eae..2c8612a9d7822 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -984,6 +984,11 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
/// the CallStackRadixTreeBuilder class in ProfileData/MemProf.h for format.
std::vector<uint64_t> RadixArray;
+ /// Map from the module's stack id index to the index in the
+ /// ModuleSummaryIndex's StackIds vector. Populated when the STACK_IDS record
+ /// is processed and used to avoid repeated hash lookups.
+ std::vector<unsigned> StackIdToIndex;
+
public:
ModuleSummaryIndexBitcodeReader(
BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex,
@@ -7636,8 +7641,7 @@ SmallVector<unsigned> ModuleSummaryIndexBitcodeReader::parseAllocInfoContext(
StackIdList.reserve(NumStackEntries);
for (unsigned J = 0; J < NumStackEntries; J++) {
assert(Record[I] < StackIds.size());
- StackIdList.push_back(
- TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
+ StackIdList.push_back(StackIdToIndex[Record[I++]]);
}
} else {
unsigned RadixIndex = Record[I++];
@@ -7660,7 +7664,7 @@ SmallVector<unsigned> ModuleSummaryIndexBitcodeReader::parseAllocInfoContext(
assert(static_cast<std::make_signed_t<unsigned>>(Elem) >= 0);
}
RadixIndex++;
- StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[Elem]));
+ StackIdList.push_back(StackIdToIndex[Elem]);
}
}
return StackIdList;
@@ -8123,16 +8127,22 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
case bitc::FS_STACK_IDS: { // [n x stackid]
// Save stack ids in the reader to consult when adding stack ids from the
// lists in the stack node and alloc node entries.
+ assert(StackIds.empty());
if (Version <= 11) {
StackIds = ArrayRef<uint64_t>(Record);
- break;
+ } else {
+ // This is an array of 32-bit fixed-width values, holding each 64-bit
+ // context id as a pair of adjacent (most significant first) 32-bit
+ // words.
+ assert(Record.size() % 2 == 0);
+ StackIds.reserve(Record.size() / 2);
+ for (auto R = Record.begin(); R != Record.end(); R += 2)
+ StackIds.push_back(*R << 32 | *(R + 1));
}
- // This is an array of 32-bit fixed-width values, holding each 64-bit
- // context id as a pair of adjacent (most significant first) 32-bit words.
- assert(Record.size() % 2 == 0);
- StackIds.reserve(Record.size() / 2);
- for (auto R = Record.begin(); R != Record.end(); R += 2)
- StackIds.push_back(*R << 32 | *(R + 1));
+ assert(StackIdToIndex.empty());
+ StackIdToIndex.reserve(StackIds.size());
+ for (uint64_t StackId : StackIds)
+ StackIdToIndex.push_back(TheIndex.addOrGetStackIdIndex(StackId));
break;
}
@@ -8146,7 +8156,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
SmallVector<unsigned> StackIdList;
for (uint64_t R : drop_begin(Record)) {
assert(R < StackIds.size());
- StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[R]));
+ StackIdList.push_back(StackIdToIndex[R]);
}
ValueInfo VI = std::get<0>(getValueInfoFromValueId(ValueID));
PendingCallsites.push_back(CallsiteInfo({VI, std::move(StackIdList)}));
@@ -8162,8 +8172,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
SmallVector<unsigned> StackIdList;
for (unsigned J = 0; J < NumStackIds; J++) {
assert(*RecordIter < StackIds.size());
- StackIdList.push_back(
- TheIndex.addOrGetStackIdIndex(StackIds[*RecordIter++]));
+ StackIdList.push_back(StackIdToIndex[*RecordIter++]);
}
SmallVector<unsigned> Versions;
for (unsigned J = 0; J < NumVersions; J++)
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index c90ee21c6750b..f54a0c44d717b 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1724,16 +1724,14 @@ bool MIParser::parseSubRegisterIndex(unsigned &SubReg) {
}
bool MIParser::parseRegisterTiedDefIndex(unsigned &TiedDefIdx) {
- if (!consumeIfPresent(MIToken::kw_tied_def))
- return true;
+ assert(Token.is(MIToken::kw_tied_def));
+ lex();
if (Token.isNot(MIToken::IntegerLiteral))
return error("expected an integer literal after 'tied-def'");
if (getUnsigned(TiedDefIdx))
return true;
lex();
- if (expectAndConsume(MIToken::rparen))
- return true;
- return false;
+ return expectAndConsume(MIToken::rparen);
}
bool MIParser::assignRegisterTies(MachineInstr &MI,
@@ -1781,6 +1779,8 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
if (parseRegisterFlag(Flags))
return true;
}
+ // Update IsDef as we may have read a def flag.
+ IsDef = hasRegState(Flags, RegState::Define);
if (!Token.isRegister())
return error("expected a register after register flags");
Register Reg;
@@ -1802,56 +1802,46 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
if (parseRegisterClassOrBank(*RegInfo))
return true;
}
- MachineRegisterInfo &MRI = MF.getRegInfo();
- if (!hasRegState(Flags, RegState::Define)) {
- if (consumeIfPresent(MIToken::lparen)) {
- unsigned Idx;
- if (!parseRegisterTiedDefIndex(Idx))
- TiedDefIdx = Idx;
- else {
- // Try a redundant low-level type.
- LLT Ty;
- if (parseLowLevelType(Token.location(), Ty))
- return error("expected tied-def or low-level type after '('");
-
- if (expectAndConsume(MIToken::rparen))
- return true;
-
- if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
- return error("inconsistent type for generic virtual register");
- MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
- MRI.setType(Reg, Ty);
- MRI.noteNewVirtualRegister(Reg);
- }
- }
- } else if (consumeIfPresent(MIToken::lparen)) {
- // Virtual registers may have a tpe with GlobalISel.
- if (!Reg.isVirtual())
- return error("unexpected type on physical register");
+ if (consumeIfPresent(MIToken::lparen)) {
+ // For a def, we only expect a type. For use we expect either a type or a
+ // tied-def. Additionally, for physical registers, we don't expect a type.
+ if (Token.is(MIToken::kw_tied_def)) {
+ if (IsDef)
+ return error("tied-def not supported for defs");
+ unsigned Idx;
+ if (parseRegisterTiedDefIndex(Idx))
+ return true;
+ TiedDefIdx = Idx;
+ } else {
+ if (!Reg.isVirtual())
+ return error("unexpected type on physical register");
- LLT Ty;
- if (parseLowLevelType(Token.location(), Ty))
- return true;
+ LLT Ty;
+ // If type parsing fails, forward the parse error for defs.
+ if (parseLowLevelType(Token.location(), Ty))
+ return IsDef ? true
+ : error("expected tied-def or low-level type after '('");
- if (expectAndConsume(MIToken::rparen))
- return true;
+ if (expectAndConsume(MIToken::rparen))
+ return true;
- if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
- return error("inconsistent type for generic virtual register");
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
+ return error("inconsistent type for generic virtual register");
- MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
- MRI.setType(Reg, Ty);
- } else if (Reg.isVirtual()) {
- // Generic virtual registers must have a type.
- // If we end up here this means the type hasn't been specified and
- // this is bad!
+ MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
+ MRI.setType(Reg, Ty);
+ MRI.noteNewVirtualRegister(Reg);
+ }
+ } else if (IsDef && Reg.isVirtual()) {
+ // Generic virtual register defs must have a type.
if (RegInfo->Kind == VRegInfo::GENERIC ||
RegInfo->Kind == VRegInfo::REGBANK)
return error("generic virtual registers must have a type");
}
- if (hasRegState(Flags, RegState::Define)) {
+ if (IsDef) {
if (hasRegState(Flags, RegState::Kill))
return error("cannot have a killed def operand");
} else {
@@ -1859,15 +1849,14 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
return error("cannot have a dead use operand");
}
- Dest = MachineOperand::CreateReg(Reg, hasRegState(Flags, RegState::Define),
- hasRegState(Flags, RegState::Implicit),
- hasRegState(Flags, RegState::Kill),
- hasRegState(Flags, RegState::Dead),
- hasRegState(Flags, RegState::Undef),
- hasRegState(Flags, RegState::EarlyClobber),
- SubReg, hasRegState(Flags, RegState::Debug),
- hasRegState(Flags, RegState::InternalRead),
- hasRegState(Flags, RegState::Renamable));
+ Dest = MachineOperand::CreateReg(
+ Reg, IsDef, hasRegState(Flags, RegState::Implicit),
+ hasRegState(Flags, RegState::Kill), hasRegState(Flags, RegState::Dead),
+ hasRegState(Flags, RegState::Undef),
+ hasRegState(Flags, RegState::EarlyClobber), SubReg,
+ hasRegState(Flags, RegState::Debug),
+ hasRegState(Flags, RegState::InternalRead),
+ hasRegState(Flags, RegState::Renamable));
return false;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e4b4d80896fa7..3c7b46a9021da 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8439,6 +8439,10 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
unsigned BW = VT.getScalarSizeInBits();
unsigned Opcode = Node->getOpcode();
+ // Scalarize if the vector multiplication is unlikely to work.
+ if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
+ return DAG.UnrollVectorOp(Node);
+
switch (Opcode) {
case ISD::CLMUL: {
// NOTE: If you change this expansion, please update the cost model
diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp
index fc30d421506a6..fb9f2ec333a7b 100644
--- a/llvm/lib/Support/CrashRecoveryContext.cpp
+++ b/llvm/lib/Support/CrashRecoveryContext.cpp
@@ -92,7 +92,8 @@ static LLVM_THREAD_LOCAL const CrashRecoveryContext *IsRecoveringFromCrash;
} // namespace
-static void installExceptionOrSignalHandlers();
+static void
+installExceptionOrSignalHandlers(bool NeedsPOSIXUtilitySignalHandling);
static void uninstallExceptionOrSignalHandlers();
CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() = default;
@@ -137,13 +138,13 @@ CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
return CRCI->CRC;
}
-void CrashRecoveryContext::Enable() {
+void CrashRecoveryContext::Enable(bool NeedsPOSIXUtilitySignalHandling) {
std::lock_guard<std::mutex> L(getCrashRecoveryContextMutex());
// FIXME: Shouldn't this be a refcount or something?
if (gCrashRecoveryEnabled)
return;
gCrashRecoveryEnabled = true;
- installExceptionOrSignalHandlers();
+ installExceptionOrSignalHandlers(NeedsPOSIXUtilitySignalHandling);
}
void CrashRecoveryContext::Disable() {
@@ -193,7 +194,8 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
// catches exceptions if they would bubble out from the stack frame with __try /
// __except.
-static void installExceptionOrSignalHandlers() {}
+static void
+installExceptionOrSignalHandlers(bool NeedsPOSIXUtilitySignalHandling) {}
static void uninstallExceptionOrSignalHandlers() {}
// We need this function because the call to GetExceptionInformation() can only
@@ -309,7 +311,8 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
// non-NULL, valid VEH handles, or NULL.
static LLVM_THREAD_LOCAL const void* sCurrentExceptionHandle;
-static void installExceptionOrSignalHandlers() {
+static void
+installExceptionOrSignalHandlers(bool NeedsPOSIXUtilitySignalHandling) {
// We can set up vectored exception handling now. We will install our
// handler as the front of the list, though there's no assurances that
// it will remain at the front (another call could install itself before
@@ -390,7 +393,8 @@ static void CrashRecoverySignalHandler(int Signal) {
const_cast<CrashRecoveryContextImpl *>(CRCI)->HandleCrash(RetCode, Signal);
}
-static void installExceptionOrSignalHandlers() {
+static void
+installExceptionOrSignalHandlers(bool NeedsPOSIXUtilitySignalHandling) {
// Setup the signal handler.
struct sigaction Handler;
Handler.sa_handler = CrashRecoverySignalHandler;
@@ -398,7 +402,14 @@ static void installExceptionOrSignalHandlers() {
sigemptyset(&Handler.sa_mask);
for (unsigned i = 0; i != NumSignals; ++i) {
- sigaction(Signals[i], &Handler, &PrevActions[i]);
+ if (NeedsPOSIXUtilitySignalHandling) {
+ // Don't install the new handler if the signal disposition is SIG_IGN.
+ struct sigaction act;
+ if (sigaction(Signals[i], NULL, &act) == 0 && act.sa_handler != SIG_IGN)
+ sigaction(Signals[i], &Handler, &PrevActions[i]);
+ } else {
+ sigaction(Signals[i], &Handler, &PrevActions[i]);
+ }
}
}
diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp
index b90f4e0714458..797c5d35bec35 100644
--- a/llvm/lib/Support/InitLLVM.cpp
+++ b/llvm/lib/Support/InitLLVM.cpp
@@ -73,7 +73,8 @@ using namespace llvm;
using namespace llvm::sys;
InitLLVM::InitLLVM(int &Argc, const char **&Argv,
- bool InstallPipeSignalExitHandler) {
+ bool InstallPipeSignalExitHandler,
+ bool NeedsPOSIXUtilitySignalHandling) {
#ifndef NDEBUG
static std::atomic<bool> Initialized{false};
assert(!Initialized && "InitLLVM was already initialized!");
@@ -81,7 +82,12 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv,
#endif
// Bring stdin/stdout/stderr into a known state.
+#ifdef _WIN32
sys::AddSignalHandler(CleanupStdHandles, nullptr);
+#else
+ sys::AddSignalHandler(CleanupStdHandles, nullptr,
+ NeedsPOSIXUtilitySignalHandling);
+#endif
if (InstallPipeSignalExitHandler)
// The pipe signal handler must be installed before any other handlers are
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 58d21154fed08..07e7781d0839d 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -632,31 +632,31 @@ KnownBits KnownBits::clmul(const KnownBits &LHS, const KnownBits &RHS) {
std::optional<bool> KnownBits::eq(const KnownBits &LHS, const KnownBits &RHS) {
if (LHS.isConstant() && RHS.isConstant())
- return std::optional<bool>(LHS.getConstant() == RHS.getConstant());
+ return LHS.getConstant() == RHS.getConstant();
if (LHS.One.intersects(RHS.Zero) || RHS.One.intersects(LHS.Zero))
- return std::optional<bool>(false);
+ return false;
return std::nullopt;
}
std::optional<bool> KnownBits::ne(const KnownBits &LHS, const KnownBits &RHS) {
if (std::optional<bool> KnownEQ = eq(LHS, RHS))
- return std::optional<bool>(!*KnownEQ);
+ return !*KnownEQ;
return std::nullopt;
}
std::optional<bool> KnownBits::ugt(const KnownBits &LHS, const KnownBits &RHS) {
// LHS >u RHS -> false if umax(LHS) <= umax(RHS)
if (LHS.getMaxValue().ule(RHS.getMinValue()))
- return std::optional<bool>(false);
+ return false;
// LHS >u RHS -> true if umin(LHS) > umax(RHS)
if (LHS.getMinValue().ugt(RHS.getMaxValue()))
- return std::optional<bool>(true);
+ return true;
return std::nullopt;
}
std::optional<bool> KnownBits::uge(const KnownBits &LHS, const KnownBits &RHS) {
if (std::optional<bool> IsUGT = ugt(RHS, LHS))
- return std::optional<bool>(!*IsUGT);
+ return !*IsUGT;
return std::nullopt;
}
@@ -671,16 +671,16 @@ std::optional<bool> KnownBits::ule(const KnownBits &LHS, const KnownBits &RHS) {
std::optional<bool> KnownBits::sgt(const KnownBits &LHS, const KnownBits &RHS) {
// LHS >s RHS -> false if smax(LHS) <= smax(RHS)
if (LHS.getSignedMaxValue().sle(RHS.getSignedMinValue()))
- return std::optional<bool>(false);
+ return false;
// LHS >s RHS -> true if smin(LHS) > smax(RHS)
if (LHS.getSignedMinValue().sgt(RHS.getSignedMaxValue()))
- return std::optional<bool>(true);
+ return true;
return std::nullopt;
}
std::optional<bool> KnownBits::sge(const KnownBits &LHS, const KnownBits &RHS) {
if (std::optional<bool> KnownSGT = sgt(RHS, LHS))
- return std::optional<bool>(!*KnownSGT);
+ return !*KnownSGT;
return std::nullopt;
}
diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc
index 56ad4fc504153..e861240189617 100644
--- a/llvm/lib/Support/Unix/Signals.inc
+++ b/llvm/lib/Support/Unix/Signals.inc
@@ -84,8 +84,10 @@
using namespace llvm;
-static void SignalHandler(int Sig, siginfo_t *Info, void *);
+static void SignalHandler(int Sig, siginfo_t *Info, void *Context);
+static void SignalHandlerTerminate(int Sig, siginfo_t *Info, void *Context);
static void InfoSignalHandler(int Sig); // defined below.
+static void InfoSignalHandlerTerminate(int Sig); // defined below.
using SignalHandlerFunctionType = void (*)();
/// The function to call if ctrl-c is pressed.
@@ -292,7 +294,8 @@ static void CreateSigAltStack() {
static void CreateSigAltStack() {}
#endif
-static void RegisterHandlers() { // Not signal-safe.
+static void RegisterHandlers(
+ bool NeedsPOSIXUtilitySignalHandling = false) { // Not signal-safe.
// The mutex prevents other threads from registering handlers while we're
// doing it. We also have to protect the handlers and their count because
// a signal handler could fire while we're registering handlers.
@@ -317,18 +320,34 @@ static void RegisterHandlers() { // Not signal-safe.
switch (Kind) {
case SignalKind::IsKill:
- NewHandler.sa_sigaction = SignalHandler;
+ if (NeedsPOSIXUtilitySignalHandling)
+ // If POSIX signal-handling semantics are followed, the signal handler
+ // resignals itself to terminate after handling the signal.
+ NewHandler.sa_sigaction = SignalHandlerTerminate;
+ else
+ NewHandler.sa_sigaction = SignalHandler;
NewHandler.sa_flags = SA_NODEFER | SA_RESETHAND | SA_ONSTACK | SA_SIGINFO;
break;
case SignalKind::IsInfo:
- NewHandler.sa_handler = InfoSignalHandler;
+ if (NeedsPOSIXUtilitySignalHandling)
+ // If POSIX signal-handling semantics are followed, the signal handler
+ // resignals itself to terminate after handling the signal.
+ NewHandler.sa_handler = InfoSignalHandlerTerminate;
+ else
+ NewHandler.sa_handler = InfoSignalHandler;
NewHandler.sa_flags = SA_ONSTACK;
break;
}
sigemptyset(&NewHandler.sa_mask);
- // Install the new handler, save the old one in RegisteredSignalInfo.
- sigaction(Signal, &NewHandler, &RegisteredSignalInfo[Index].SA);
+ if (NeedsPOSIXUtilitySignalHandling) {
+ // Don't install the new handler if the signal disposition is SIG_IGN.
+ struct sigaction act;
+ if (sigaction(Signal, NULL, &act) == 0 && act.sa_handler != SIG_IGN)
+ sigaction(Signal, &NewHandler, &RegisteredSignalInfo[Index].SA);
+ } else {
+ sigaction(Signal, &NewHandler, &RegisteredSignalInfo[Index].SA);
+ }
RegisteredSignalInfo[Index].SigNo = Signal;
++NumRegisteredSignals;
};
@@ -377,7 +396,7 @@ void sys::CleanupOnSignal(uintptr_t Context) {
}
// The signal handler that runs.
-static void SignalHandler(int Sig, siginfo_t *Info, void *) {
+static void SignalHandler(int Sig, siginfo_t *Info, void *Context) {
// Restore the signal behavior to default, so that the program actually
// crashes when we return and the signal reissues. This also ensures that if
// we crash in our signal handler that the program will terminate immediately
@@ -437,12 +456,30 @@ static void SignalHandler(int Sig, siginfo_t *Info, void *) {
#endif
}
+static void SignalHandlerTerminate(int Sig, siginfo_t *Info, void *Context) {
+ SignalHandler(Sig, Info, Context);
+
+ // Resignal if it is a kill signal so that the exit code contains the
+ // terminating signal number.
+ if (llvm::is_contained(KillSigs, Sig))
+ raise(Sig); // Execute the default handler.
+}
+
static void InfoSignalHandler(int Sig) {
SaveAndRestore SaveErrnoDuringASignalHandler(errno);
if (SignalHandlerFunctionType CurrentInfoFunction = InfoSignalFunction)
CurrentInfoFunction();
}
+static void InfoSignalHandlerTerminate(int Sig) {
+ InfoSignalHandler(Sig);
+
+ if (Sig == SIGUSR1) {
+ sys::unregisterHandlers();
+ raise(Sig);
+ }
+}
+
void sys::RunInterruptHandlers() {
// Let's not interfere with stack trace symbolication and friends.
auto BypassSandbox = sandbox::scopedDisable();
@@ -488,10 +525,11 @@ void llvm::sys::DontRemoveFileOnSignal(StringRef Filename) {
/// Add a function to be called when a signal is delivered to the process. The
/// handler can have a cookie passed to it to identify what instance of the
/// handler it is.
-void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr,
- void *Cookie) { // Signal-safe.
+void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr, void *Cookie,
+ bool NeedsPOSIXUtilitySignalHandling) {
+ // Signal-safe.
insertSignalHandler(FnPtr, Cookie);
- RegisterHandlers();
+ RegisterHandlers(NeedsPOSIXUtilitySignalHandling);
}
#if ENABLE_BACKTRACES && defined(HAVE_BACKTRACE) && \
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index eec112e5a80f5..7e6799befb983 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -549,8 +549,8 @@ void llvm::sys::CallOneShotPipeSignalHandler() {
/// Add a function to be called when a signal is delivered to the process. The
/// handler can have a cookie passed to it to identify what instance of the
/// handler it is.
-void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr,
- void *Cookie) {
+void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr, void *Cookie,
+ bool NeedsPOSIXUtilitySignalHandling) {
insertSignalHandler(FnPtr, Cookie);
RegisterHandler();
LeaveCriticalSection(&CriticalSection);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index ac6b9dd5dbd68..9babe675200c1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -890,11 +890,11 @@ enum SMEMatrixType {
#undef TSFLAG_INSTR_FLAGS
#undef TSFLAG_SME_MATRIX_TYPE
-int64_t getSVEPseudoMap(uint32_t Opcode);
-int64_t getSVERevInstr(uint32_t Opcode);
-int64_t getSVENonRevInstr(uint32_t Opcode);
+int32_t getSVEPseudoMap(uint32_t Opcode);
+int32_t getSVERevInstr(uint32_t Opcode);
+int32_t getSVENonRevInstr(uint32_t Opcode);
-int64_t getSMEPseudoMap(uint32_t Opcode);
+int32_t getSMEPseudoMap(uint32_t Opcode);
}
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 7630007d0f8af..a192f788ead71 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -1243,7 +1243,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Fp conversions to i16 must be kept on fp register banks to ensure
// proper saturation, as there are no 16-bit gprs.
// In addition, conversion intrinsics have fpr output when the input
- // size matches the output size, or PRCVT is present.
+ // size matches the output size, or FPRCVT is present.
if (DstSize == 16 ||
((DstSize == SrcSize || STI.hasFeature(AArch64::FeatureFPRCVT)) &&
all_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 9ad2f2e11fbcc..5f0341c5aaa92 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -775,6 +775,14 @@ defm CvtFP8VOP1Bug : AMDGPUSubtargetFeature<"cvt-fp8-vop1-bug",
[FeatureFP8ConversionInsts]
>;
+defm WMMA256bInsts : AMDGPUSubtargetFeature<"wmma-256b-insts",
+ "Has WMMA instructions where A and B matrices have duplicated data"
+>;
+
+defm WMMA128bInsts : AMDGPUSubtargetFeature<"wmma-128b-insts",
+ "Has WMMA instructions where A and B matrices do not have duplicated data"
+>;
+
defm PkFmacF16Inst : AMDGPUSubtargetFeature<"pk-fmac-f16-inst",
"Has v_pk_fmac_f16 instruction"
>;
@@ -1799,7 +1807,6 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureBackOffBarrier,
FeatureLDSBankCount32,
FeatureDLInsts,
- FeatureDot5Insts,
FeatureDot7Insts,
FeatureDot8Insts,
FeatureDot9Insts,
@@ -1820,9 +1827,9 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureD16Writes32BitVgpr,
]>;
-// There are few workarounds that need to be
-// added to all targets. This pessimizes codegen
-// a bit on the generic GFX11 target.
+// There are a few workarounds that need to be added to all targets. This
+// pessimizes codegen a bit on the generic GFX11 target. This generic target
+// does not include GFX1170 due to incompatible changes.
def FeatureISAVersion11_Generic: FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureMSAALoadDstSelBug,
@@ -1831,14 +1838,18 @@ def FeatureISAVersion11_Generic: FeatureSet<
FeatureMADIntraFwdBug,
FeaturePrivEnabledTrap2NopBug,
FeatureRequiresCOV6,
- FeatureRequiredExportPriority])>;
+ FeatureRequiredExportPriority,
+ FeatureDot5Insts,
+ FeatureWMMA256bInsts])>;
def FeatureISAVersion11_0_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureMSAALoadDstSelBug,
FeatureVALUTransUseHazard,
FeatureMADIntraFwdBug,
- FeaturePrivEnabledTrap2NopBug])>;
+ FeaturePrivEnabledTrap2NopBug,
+ FeatureDot5Insts,
+ FeatureWMMA256bInsts])>;
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_0_Common.Features,
@@ -1861,7 +1872,9 @@ def FeatureISAVersion11_5_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureSALUFloatInsts,
FeatureDPPSrc1SGPR,
- FeatureRequiredExportPriority])>;
+ FeatureRequiredExportPriority,
+ FeatureDot5Insts,
+ FeatureWMMA256bInsts])>;
def FeatureISAVersion11_5_0 : FeatureSet<
!listconcat(FeatureISAVersion11_5_Common.Features,
@@ -1885,7 +1898,8 @@ def FeatureISAVersion11_7_0 : FeatureSet<
[FeatureSALUFloatInsts,
FeatureDPPSrc1SGPR,
FeatureFP8ConversionInsts,
- FeatureDot11Insts])>;
+ FeatureDot11Insts,
+ FeatureWMMA128bInsts])>;
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
@@ -1915,6 +1929,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureImageInsts,
FeatureExtendedImageInsts,
FeatureFP8ConversionInsts,
+ FeatureWMMA128bInsts,
FeatureIEEEMinimumMaximumInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
index f4872ec63f7c3..4c8b91da765f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp
@@ -148,7 +148,7 @@ class AMDGPULowerVGPREncoding {
/// bit mapping. Optionally takes second array \p Ops2 for VOPD.
/// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
/// is checked.
- void computeMode(ModeTy &NewMode, MachineInstr &MI,
+ void computeMode(ModeTy &NewMode, const MachineInstr &MI,
const AMDGPU::OpName Ops[OpNum],
const AMDGPU::OpName *Ops2 = nullptr);
@@ -224,13 +224,14 @@ AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
return Idx >> 8;
}
-void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, MachineInstr &MI,
+void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode,
+ const MachineInstr &MI,
const AMDGPU::OpName Ops[OpNum],
const AMDGPU::OpName *Ops2) {
NewMode = {};
for (unsigned I = 0; I < OpNum; ++I) {
- MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
+ const MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
std::optional<unsigned> MSBits;
if (Op)
@@ -238,7 +239,7 @@ void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, MachineInstr &MI,
#if !defined(NDEBUG)
if (MSBits.has_value() && Ops2) {
- auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
+ const MachineOperand *Op2 = TII->getNamedOperand(MI, Ops2[I]);
if (Op2) {
std::optional<unsigned> MSBits2;
MSBits2 = getMSBs(*Op2);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 998a9d0910a07..01cc4ff4ae854 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1556,6 +1556,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
return AMDGPU::isGFX11Plus(getSTI());
}
+ bool isGFX1170() const { return AMDGPU::isGFX1170(getSTI()); }
+
bool isGFX12() const { return AMDGPU::isGFX12(getSTI()); }
bool isGFX12Plus() const { return AMDGPU::isGFX12Plus(getSTI()); }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index b2dfd098735a0..2309a56f612f1 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -686,11 +686,19 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Address, CS))
break;
+ if (isGFX1170() &&
+ tryDecodeInst(DecoderTableGFX117064, MI, QW, Address, CS))
+ break;
+
if (isGFX11() &&
tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
Address, CS))
break;
+ if (isGFX1170() &&
+ tryDecodeInst(DecoderTableGFX1170W6464, MI, QW, Address, CS))
+ break;
+
if (isGFX11() &&
tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
break;
@@ -2247,6 +2255,8 @@ bool AMDGPUDisassembler::isGFX11Plus() const {
return AMDGPU::isGFX11Plus(STI);
}
+bool AMDGPUDisassembler::isGFX1170() const { return AMDGPU::isGFX1170(STI); }
+
bool AMDGPUDisassembler::isGFX12() const {
return STI.hasFeature(AMDGPU::FeatureGFX12);
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 28f71d8d7556b..b01eb8dd59fad 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -178,6 +178,7 @@ class AMDGPUDisassembler : public MCDisassembler {
bool isGFX10() const;
bool isGFX10Plus() const;
bool isGFX11() const;
+ bool isGFX1170() const;
bool isGFX11Plus() const;
bool isGFX12() const;
bool isGFX12Plus() const;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b308e0d77305f..2365b6175a46f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -396,6 +396,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
}
+ bool isGFX1170() const {
+ return getGeneration() == GFX11 && hasWMMA128bInsts();
+ }
+
bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
bool hasAtomicFaddInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index b96c17e137072..f6e9d2d485444 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -326,7 +326,7 @@ class R600InstrInfo final : public R600GenInstrInfo {
namespace R600 {
-int64_t getLDSNoRetOp(uint32_t Opcode);
+int32_t getLDSNoRetOp(uint32_t Opcode);
} //End namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9180d5fc8bcf0..d7c997c1f5092 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1159,7 +1159,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
- int64_t NewOpc;
+ int32_t NewOpc;
// Try to map original to commuted opcode
NewOpc = AMDGPU::getCommuteRev(Opcode);
@@ -10377,9 +10377,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
Opcode = MFMAOp;
}
- int64_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+ int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
- if (MCOp == (uint32_t)-1 && ST.hasGFX1250Insts())
+ if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
// -1 means that Opcode is already a native instruction.
@@ -10387,20 +10387,20 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
return Opcode;
if (ST.hasGFX90AInsts()) {
- uint32_t NMCOp = (uint32_t)-1;
+ uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
if (ST.hasGFX940Insts())
NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
- if (NMCOp == (uint32_t)-1)
+ if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
- if (NMCOp == (uint32_t)-1)
+ if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
- if (NMCOp != (uint32_t)-1)
+ if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
MCOp = NMCOp;
}
- // (uint32_t)-1 means that Opcode is a pseudo instruction that has
- // no encoding in the given subtarget generation.
- if (MCOp == (uint32_t)-1)
+ // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
+ // encoding in the given subtarget generation.
+ if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
return -1;
if (isAsmOnlyOpcode(MCOp))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0b54513bb6114..c945533f0f2ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1737,86 +1737,86 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
namespace AMDGPU {
LLVM_READONLY
- int64_t getVOPe64(uint32_t Opcode);
+ int32_t getVOPe64(uint32_t Opcode);
LLVM_READONLY
- int64_t getVOPe32(uint32_t Opcode);
+ int32_t getVOPe32(uint32_t Opcode);
LLVM_READONLY
- int64_t getSDWAOp(uint32_t Opcode);
+ int32_t getSDWAOp(uint32_t Opcode);
LLVM_READONLY
- int64_t getDPPOp32(uint32_t Opcode);
+ int32_t getDPPOp32(uint32_t Opcode);
LLVM_READONLY
- int64_t getDPPOp64(uint32_t Opcode);
+ int32_t getDPPOp64(uint32_t Opcode);
LLVM_READONLY
- int64_t getBasicFromSDWAOp(uint32_t Opcode);
+ int32_t getBasicFromSDWAOp(uint32_t Opcode);
LLVM_READONLY
- int64_t getCommuteRev(uint32_t Opcode);
+ int32_t getCommuteRev(uint32_t Opcode);
LLVM_READONLY
- int64_t getCommuteOrig(uint32_t Opcode);
+ int32_t getCommuteOrig(uint32_t Opcode);
LLVM_READONLY
- int64_t getAddr64Inst(uint32_t Opcode);
+ int32_t getAddr64Inst(uint32_t Opcode);
/// Check if \p Opcode is an Addr64 opcode.
///
/// \returns \p Opcode if it is an Addr64 opcode, otherwise -1.
LLVM_READONLY
- int64_t getIfAddr64Inst(uint32_t Opcode);
+ int32_t getIfAddr64Inst(uint32_t Opcode);
LLVM_READONLY
- int64_t getSOPKOp(uint32_t Opcode);
+ int32_t getSOPKOp(uint32_t Opcode);
/// \returns SADDR form of a FLAT Global instruction given an \p Opcode
/// of a VADDR form.
LLVM_READONLY
- int64_t getGlobalSaddrOp(uint32_t Opcode);
+ int32_t getGlobalSaddrOp(uint32_t Opcode);
/// \returns VADDR form of a FLAT Global instruction given an \p Opcode
/// of a SADDR form.
LLVM_READONLY
- int64_t getGlobalVaddrOp(uint32_t Opcode);
+ int32_t getGlobalVaddrOp(uint32_t Opcode);
LLVM_READONLY
- int64_t getVCMPXNoSDstOp(uint32_t Opcode);
+ int32_t getVCMPXNoSDstOp(uint32_t Opcode);
/// \returns ST form with only immediate offset of a FLAT Scratch instruction
/// given an \p Opcode of an SS (SADDR) form.
LLVM_READONLY
- int64_t getFlatScratchInstSTfromSS(uint32_t Opcode);
+ int32_t getFlatScratchInstSTfromSS(uint32_t Opcode);
/// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
/// of an SVS (SADDR + VADDR) form.
LLVM_READONLY
- int64_t getFlatScratchInstSVfromSVS(uint32_t Opcode);
+ int32_t getFlatScratchInstSVfromSVS(uint32_t Opcode);
/// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode
/// of an SV (VADDR) form.
LLVM_READONLY
- int64_t getFlatScratchInstSSfromSV(uint32_t Opcode);
+ int32_t getFlatScratchInstSSfromSV(uint32_t Opcode);
/// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
/// of an SS (SADDR) form.
LLVM_READONLY
- int64_t getFlatScratchInstSVfromSS(uint32_t Opcode);
+ int32_t getFlatScratchInstSVfromSS(uint32_t Opcode);
/// \returns earlyclobber version of a MAC MFMA is exists.
LLVM_READONLY
- int64_t getMFMAEarlyClobberOp(uint32_t Opcode);
+ int32_t getMFMAEarlyClobberOp(uint32_t Opcode);
/// \returns Version of an MFMA instruction which uses AGPRs for srcC and
/// vdst, given an \p Opcode of an MFMA which uses VGPRs for srcC/vdst.
LLVM_READONLY
- int64_t getMFMASrcCVDstAGPROp(uint32_t Opcode);
+ int32_t getMFMASrcCVDstAGPROp(uint32_t Opcode);
/// \returns v_cmpx version of a v_cmp instruction.
LLVM_READONLY
- int64_t getVCMPXOpFromVCMP(uint32_t Opcode);
+ int32_t getVCMPXOpFromVCMP(uint32_t Opcode);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f063b4eb77774..c2396674e4f96 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -44,9 +44,10 @@ class GFXGen<Predicate pred, string dn, string suffix, int sub> {
def GFX13Gen : GFXGen<isGFX13Only, "GFX13", "_gfx13", SIEncodingFamily.GFX13>;
def GFX1250Gen : GFXGen<isGFX125xOnly, "GFX1250", "_gfx1250", SIEncodingFamily.GFX1250>;
def GFX12Not12_50Gen : GFXGen<isGFX12Not12_50, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
-def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
-def GFX11Gen : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>;
-def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
+def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
+def GFX1170Gen : GFXGen<isGFX11Only, "GFX1170", "_gfx1170", SIEncodingFamily.GFX11>;
+def GFX11Gen : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>;
+def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
//===----------------------------------------------------------------------===//
// SI DAG Nodes
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3f32d1166fc89..3ffae37bbf239 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -812,7 +812,7 @@ unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) {
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
-int64_t getMCOpcode(uint32_t Opcode, unsigned Gen) {
+int32_t getMCOpcode(uint32_t Opcode, unsigned Gen) {
return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}
@@ -2598,6 +2598,10 @@ bool isGFX11(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureGFX11);
}
+bool isGFX1170(const MCSubtargetInfo &STI) {
+ return isGFX11(STI) && STI.hasFeature(AMDGPU::FeatureWMMA128bInsts);
+}
+
bool isGFX11Plus(const MCSubtargetInfo &STI) {
return isGFX11(STI) || isGFX12Plus(STI);
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 7500c2481a4bd..fa24383c90fa6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -416,7 +416,7 @@ inline bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx) {
}
LLVM_READONLY
-int64_t getSOPPWithRelaxation(uint32_t Opcode);
+int32_t getSOPPWithRelaxation(uint32_t Opcode);
struct MIMGBaseOpcodeInfo {
MIMGBaseOpcode BaseOpcode;
@@ -646,7 +646,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
const MCSubtargetInfo &STI);
LLVM_READONLY
-int64_t getMCOpcode(uint32_t Opcode, unsigned Gen);
+int32_t getMCOpcode(uint32_t Opcode, unsigned Gen);
LLVM_READONLY
unsigned getVOPDOpcode(unsigned Opc, bool VOPD3);
@@ -1705,6 +1705,7 @@ bool isGFX10Plus(const MCSubtargetInfo &STI);
bool isNotGFX10Plus(const MCSubtargetInfo &STI);
bool isGFX10Before1030(const MCSubtargetInfo &STI);
bool isGFX11(const MCSubtargetInfo &STI);
+bool isGFX1170(const MCSubtargetInfo &STI);
bool isGFX11Plus(const MCSubtargetInfo &STI);
bool isGFX12(const MCSubtargetInfo &STI);
bool isGFX12Plus(const MCSubtargetInfo &STI);
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 9a4054b8ad248..9e62dc7c9db0a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1426,22 +1426,18 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
- let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
- let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
- def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
- }
- }
- if convertibleTo3Addr then {
+
+ let SubtargetPredicate = HasWMMA256bInsts in {
let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
- let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
- def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+ let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
+ def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+ }
+ if convertibleTo3Addr then {
+ let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
+ def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+ }
}
}
- def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
- !cast<Instruction>(NAME # _threeaddr # Suffix)>;
- }
-
- let SubtargetPredicate = isGFX11Only in {
if !eq(Type, WMMAOpSel) then {
def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
} else if !eq(Type, WMMAUIClamp) then {
@@ -1450,6 +1446,11 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
}
}
+
+ if convertibleTo3Addr then {
+ def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
+ !cast<Instruction>(NAME # _threeaddr # Suffix)>;
+ }
}
@@ -1727,7 +1728,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse
defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2");
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in {
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1, SubtargetPredicate = HasWMMA128bInsts in {
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in
def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo {
let PseudoInstr = Instr#PseudoInstrSuffix;
@@ -2047,7 +2048,7 @@ class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile
let WaveSizePredicate = isWave64;
}
-let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in {
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX11PlusNot12_50, OtherPredicates = [HasWMMA128bInsts] in {
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
@@ -2074,7 +2075,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in {
def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
}
-let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
+let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX11PlusNot12_50, OtherPredicates = [HasWMMA128bInsts] in {
defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
@@ -2229,6 +2230,18 @@ multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<8> op, VOP3PWMMA_Profile WMMAP,
VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
}
+multiclass VOP3P_Real_WMMA_gfx1170 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave32, DecoderNamespace = "GFX1170" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1170Gen, op, WMMAP>;
+ }
+}
+
+multiclass VOP3P_Real_WMMA_gfx1170w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave64, DecoderNamespace = "GFX1170W64" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1170Gen, op, WMMAP>;
+ }
+}
+
multiclass VOP3P_Real_WMMA_gfx12 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
@@ -2241,6 +2254,14 @@ multiclass VOP3P_Real_WMMA_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
}
}
+multiclass VOP3P_Real_WMMA_gfx1170_gfx12 <bits<8> op, VOP3PWMMA_Profile WMMAP> :
+ VOP3P_Real_WMMA_gfx1170<op, WMMAP>,
+ VOP3P_Real_WMMA_gfx12<op, WMMAP>;
+
+multiclass VOP3P_Real_WMMA_gfx1170_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> :
+ VOP3P_Real_WMMA_gfx1170w64<op, WMMAP>,
+ VOP3P_Real_WMMA_gfx12w64<op, WMMAP>;
+
multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> {
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1250Gen, op, WMMAP>;
@@ -2345,54 +2366,53 @@ multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<string Gen, bits<8> op, bits<8> Ld
}
}
-defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
-defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
-defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>;
-defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>;
-defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
-defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
-defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
-
-defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
-defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>;
-defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
-defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
-defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
-defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
-defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
-
-
-defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
-defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>;
-defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
-defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
-defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
-defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
-defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
-
-defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
-defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
-defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
-defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
-defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
-defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
-defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
+defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x040, F32_F16_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x041, F32_BF16_WMMA_w32>;
+defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x042, F16_F16_WMMA_w32>;
+defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x043, BF16_BF16_WMMA_w32>;
+defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x044, I32_IU8_WMMA_w32>;
+defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
+
+defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x040, F32_F16_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
+defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x042, F16_F16_WMMA_w64>;
+defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
+defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
+defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
+
+defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x050, F32_F16_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x052, F16_F16_SWMMAC_w32>;
+defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1170_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
+
+defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
+defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
+defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx1170_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
defm V_WMMA_F32_16X16X4_F32_w32 : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>;
defm V_WMMA_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>;
diff --git a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index afa1345fdb469..41f066b49cfd5 100644
--- a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -78,7 +78,11 @@ bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) {
}
// FIXME: Remove this when we don't need this:
-namespace llvm { namespace PPC { extern int64_t getNonRecordFormOpcode(uint32_t); } }
+namespace llvm {
+namespace PPC {
+extern int32_t getNonRecordFormOpcode(uint32_t);
+}
+} // namespace llvm
// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 3c2ad1b30b139..bf1755733392c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12046,31 +12046,15 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
- SDValue Lo =
- DAG.getNode(PPCISD::INST512, dl, MVT::v512i1, Loads[0], Loads[1]);
- SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
- SDValue Hi =
- DAG.getNode(PPCISD::INST512HI, dl, MVT::v512i1, Loads[2], Loads[3]);
- SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
- SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
- const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
-
- SDValue Value =
- SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), 0);
+ SDValue Value = DMFInsert1024(Loads, dl, DAG);
if (IsV1024i1) {
return DAG.getMergeValues({Value, TF}, dl);
}
// Handle Loads for V2048i1 which represents a dmr pair.
- SDValue DmrPValue;
- SDValue Dmr1Lo =
- DAG.getNode(PPCISD::INST512, dl, MVT::v512i1, Loads[4], Loads[5]);
- SDValue Dmr1Hi =
- DAG.getNode(PPCISD::INST512HI, dl, MVT::v512i1, Loads[6], Loads[7]);
- const SDValue Dmr1Ops[] = {RC, Dmr1Lo, LoSub, Dmr1Hi, HiSub};
- SDValue Dmr1Value = SDValue(
- DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Dmr1Ops), 0);
+ SmallVector<SDValue, 4> MoreLoads{Loads[4], Loads[5], Loads[6], Loads[7]};
+ SDValue Dmr1Value = DMFInsert1024(MoreLoads, dl, DAG);
SDValue Dmr0Sub = DAG.getTargetConstant(PPC::sub_dmr0, dl, MVT::i32);
SDValue Dmr1Sub = DAG.getTargetConstant(PPC::sub_dmr1, dl, MVT::i32);
@@ -12078,7 +12062,7 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
SDValue DmrPRC = DAG.getTargetConstant(PPC::DMRpRCRegClassID, dl, MVT::i32);
const SDValue DmrPOps[] = {DmrPRC, Value, Dmr0Sub, Dmr1Value, Dmr1Sub};
- DmrPValue = SDValue(
+ SDValue DmrPValue = SDValue(
DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v2048i1, DmrPOps), 0);
return DAG.getMergeValues({DmrPValue, TF}, dl);
diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index f33d4bf89381b..9114ded4a3cc4 100644
--- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -47,7 +47,7 @@ static cl::opt<bool> DisableVSXFMAMutate(
#define DEBUG_TYPE "ppc-vsx-fma-mutate"
namespace llvm { namespace PPC {
- int64_t getAltVSXFMAOpcode(uint32_t Opcode);
+int32_t getAltVSXFMAOpcode(uint32_t Opcode);
} }
namespace {
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 521b14e498af5..4168cb473fdf1 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -3829,68 +3829,63 @@ bool RISCVAsmParser::validateInstruction(MCInst &Inst,
if (!(MCID.TSFlags & RISCVII::RVVConstraintMask))
return false;
- if (Opcode == RISCV::SF_VC_V_XVW || Opcode == RISCV::SF_VC_V_IVW ||
- Opcode == RISCV::SF_VC_V_FVW || Opcode == RISCV::SF_VC_V_VVW) {
- // Operands Opcode, Dst, uimm, Dst, Rs2, Rs1 for SF_VC_V_XVW.
- MCRegister VCIXDst = Inst.getOperand(0).getReg();
- SMLoc VCIXDstLoc = Operands[2]->getStartLoc();
- if (MCID.TSFlags & RISCVII::VS1Constraint) {
- MCRegister VCIXRs1 = Inst.getOperand(Inst.getNumOperands() - 1).getReg();
- if (VCIXDst == VCIXRs1)
- return Error(VCIXDstLoc, "the destination vector register group cannot"
- " overlap the source vector register group");
- }
- if (MCID.TSFlags & RISCVII::VS2Constraint) {
- MCRegister VCIXRs2 = Inst.getOperand(Inst.getNumOperands() - 2).getReg();
- if (VCIXDst == VCIXRs2)
- return Error(VCIXDstLoc, "the destination vector register group cannot"
- " overlap the source vector register group");
- }
- return false;
- }
+ int DestIdx = RISCV::getNamedOperandIdx(Inst.getOpcode(), RISCV::OpName::vd);
+ MCRegister DestReg = Inst.getOperand(DestIdx).getReg();
- MCRegister DestReg = Inst.getOperand(0).getReg();
- unsigned Offset = 0;
- int TiedOp = MCID.getOperandConstraint(1, MCOI::TIED_TO);
- if (TiedOp == 0)
- Offset = 1;
+ // Operands[1] or Operands[2] will be the first operand, DestReg.
+ const MCParsedAsmOperand *ParsedOp = Operands[1].get();
+ if (!ParsedOp->isReg()) {
+ // XSfvcp instructions may have an immediate before vd.
+ // FIXME: Is there a better way to do this?
+ ParsedOp = Operands[2].get();
+ }
+ assert(ParsedOp->getReg() == DestReg && "Can't find parsed dest operand");
+ SMLoc Loc = ParsedOp->getStartLoc();
- // Operands[1] will be the first operand, DestReg.
- SMLoc Loc = Operands[1]->getStartLoc();
if (MCID.TSFlags & RISCVII::VS2Constraint) {
- MCRegister CheckReg = Inst.getOperand(Offset + 1).getReg();
+ int VS2Idx =
+ RISCV::getNamedOperandIdx(Inst.getOpcode(), RISCV::OpName::vs2);
+ assert(VS2Idx >= 0 && "No vs2 operand?");
+ MCRegister CheckReg = Inst.getOperand(VS2Idx).getReg();
if (DestReg == CheckReg)
return Error(Loc, "the destination vector register group cannot overlap"
" the source vector register group");
}
- if ((MCID.TSFlags & RISCVII::VS1Constraint) && Inst.getOperand(Offset + 2).isReg()) {
- MCRegister CheckReg = Inst.getOperand(Offset + 2).getReg();
- if (DestReg == CheckReg)
- return Error(Loc, "the destination vector register group cannot overlap"
- " the source vector register group");
+ if (MCID.TSFlags & RISCVII::VS1Constraint) {
+ int VS1Idx =
+ RISCV::getNamedOperandIdx(Inst.getOpcode(), RISCV::OpName::vs1);
+ // FIXME: The vs1 constraint is used on scalar and imm instructions so we
+ // need to check that the operand exists.
+ if (VS1Idx >= 0) {
+ MCRegister CheckReg = Inst.getOperand(VS1Idx).getReg();
+ if (DestReg == CheckReg)
+ return Error(Loc, "the destination vector register group cannot overlap"
+ " the source vector register group");
+ }
}
- if ((MCID.TSFlags & RISCVII::VMConstraint) && (DestReg == RISCV::V0)) {
- // vadc, vsbc are special cases. These instructions have no mask register.
- // The destination register could not be V0.
- if (Opcode == RISCV::VADC_VVM || Opcode == RISCV::VADC_VXM ||
- Opcode == RISCV::VADC_VIM || Opcode == RISCV::VSBC_VVM ||
- Opcode == RISCV::VSBC_VXM || Opcode == RISCV::VFMERGE_VFM ||
- Opcode == RISCV::VMERGE_VIM || Opcode == RISCV::VMERGE_VVM ||
- Opcode == RISCV::VMERGE_VXM)
- return Error(Loc, "the destination vector register group cannot be V0");
-
- // Regardless masked or unmasked version, the number of operands is the
- // same. For example, "viota.m v0, v2" is "viota.m v0, v2, NoRegister"
- // actually. We need to check the last operand to ensure whether it is
- // masked or not.
- MCRegister CheckReg = Inst.getOperand(Inst.getNumOperands() - 1).getReg();
- assert((CheckReg == RISCV::V0 || !CheckReg) &&
- "Unexpected register for mask operand");
- if (DestReg == CheckReg)
- return Error(Loc, "the destination vector register group cannot overlap"
- " the mask register");
+ if (MCID.TSFlags & RISCVII::VMConstraint) {
+ int VMIdx = RISCV::getNamedOperandIdx(Inst.getOpcode(), RISCV::OpName::vm);
+ assert(VMIdx >= 0 && "No vm operand?");
+
+ if (DestReg == RISCV::V0) {
+ if (MCID.operands()[Inst.getNumOperands() - 1].OperandType !=
+ RISCVOp::OPERAND_VMASK)
+ return Error(Loc, "the destination vector register group cannot be V0");
+
+ // The masked and unmasked forms have the same number of operands. For
+ // example, "viota.m v0, v2" is actually "viota.m v0, v2, NoRegister".
+ // We need to check the vm operand to see whether the instruction is
+ // masked or not.
+ MCRegister CheckReg = Inst.getOperand(VMIdx).getReg();
+ assert((!CheckReg.isValid() || CheckReg == RISCV::V0) &&
+ "Unexpected mask operand register");
+ if (CheckReg.isValid())
+ return Error(Loc, "the destination vector register group cannot overlap"
+ " the mask register");
+ }
}
+
return false;
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 38d154d90d7e0..4cb41fc92c4ba 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -35,6 +35,7 @@
#define GET_INSTRINFO_MC_DESC
#define ENABLE_INSTR_PREDICATE_VERIFIER
+#define GET_INSTRINFO_NAMED_OPS
#include "RISCVGenInstrInfo.inc"
#define GET_REGINFO_MC_DESC
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
index d1733886637f8..39a34f6ae434e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -53,6 +53,7 @@ void updateCZceFeatureImplications(MCSubtargetInfo &STI);
// Defines symbolic names for RISC-V instructions.
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_MC_HELPER_DECLS
+#define GET_INSTRINFO_OPERAND_ENUM
#include "RISCVGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
index 8aa3fb341e3b4..c007838d31dfe 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
@@ -102,6 +102,8 @@ class RVInstVBase<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
let Uses = [VL, VTYPE];
let RVVConstraint = VMConstraint;
+
+ let UseNamedOperandTable = true;
}
class RVInstVV<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
@@ -160,6 +162,8 @@ class RVInstVUnaryRd<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, dag outs,
let Uses = [VL, VTYPE];
let RVVConstraint = NoConstraint;
+
+ let UseNamedOperandTable = true;
}
class RVInstVLoadBase<bits<3> nf, RISCVWidth width, RISCVMOP mop,
@@ -181,6 +185,8 @@ class RVInstVLoadBase<bits<3> nf, RISCVWidth width, RISCVMOP mop,
let Uses = [VL, VTYPE];
let RVVConstraint = VMConstraint;
+
+ let UseNamedOperandTable = true;
}
class RVInstVLU<bits<3> nf, RISCVWidth width, RISCVLUMOP lumop, dag outs,
@@ -224,6 +230,8 @@ class RVInstVStoreBase<bits<3> nf, RISCVWidth width, RISCVMOP mop, dag outs,
let Inst{6-0} = OPC_STORE_FP.Value;
let Uses = [VL, VTYPE];
+
+ let UseNamedOperandTable = true;
}
class RVInstVSU<bits<3> nf, RISCVWidth width, RISCVSUMOP sumop, dag outs,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2b496b1b20318..98561a9345daf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -42,7 +42,6 @@ using namespace llvm;
#include "RISCVGenCompressInstEmitter.inc"
#define GET_INSTRINFO_CTOR_DTOR
-#define GET_INSTRINFO_NAMED_OPS
#include "RISCVGenInstrInfo.inc"
#define DEBUG_TYPE "riscv-instr-info"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 2932efffdb814..cfe2e5c474fbd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -19,7 +19,6 @@
#include "llvm/IR/DiagnosticInfo.h"
#define GET_INSTRINFO_HEADER
-#define GET_INSTRINFO_OPERAND_ENUM
#include "RISCVGenInstrInfo.inc"
#include "RISCVGenRegisterInfo.inc"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 1b8cf3ddf7d2e..8f97e81537f1b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -380,6 +380,8 @@ class NDSRVInstVSINTLN<bits<5> funct5, string opcodestr>
let mayLoad = 1;
let mayStore = 0;
let Uses = [VL, VTYPE];
+
+ let UseNamedOperandTable = true;
}
class NDSRVInstVSINTCvt<bits<5> fucnt5, string opcodestr>
@@ -401,6 +403,8 @@ class NDSRVInstVSINTCvt<bits<5> fucnt5, string opcodestr>
let mayStore = 0;
let Uses = [FRM, VL, VTYPE];
let RVVConstraint = VMConstraint;
+
+ let UseNamedOperandTable = true;
}
class NDSRVInstBFHCvt<bits<7> funct7, bits<5> rs1val, DAGOperand rdty,
@@ -435,6 +439,8 @@ class NDSRVInstVFPMAD<bits<6> funct6, string opcodestr>
let mayStore = 0;
let RVVConstraint = VMConstraint;
+
+ let UseNamedOperandTable = true;
}
class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr>
@@ -458,6 +464,8 @@ class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr>
let mayStore = 0;
let RVVConstraint = VMConstraint;
+
+ let UseNamedOperandTable = true;
}
class NDSRVInstVBFHCvt<bits<5> vs1, string opcodestr>
@@ -477,6 +485,8 @@ class NDSRVInstVBFHCvt<bits<5> vs1, string opcodestr>
let mayStore = 0;
let Uses = [VL, VTYPE];
+
+ let UseNamedOperandTable = true;
}
class NDSRVInstVLN<bits<5> funct5, string opcodestr>
@@ -500,6 +510,8 @@ class NDSRVInstVLN<bits<5> funct5, string opcodestr>
let Uses = [VL, VTYPE];
let RVVConstraint = VMConstraint;
+
+ let UseNamedOperandTable = true;
}
class VPseudoVLN8NoMask<VReg RetClass, bit U> :
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
index 51506f40d3811..601270d3be4ee 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
@@ -29,6 +29,8 @@ class CustomRivosVXI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
let Uses = [VL, VTYPE];
let RVVConstraint = NoConstraint;
let Constraints = "$vd = $vd_wb";
+
+ let UseNamedOperandTable = true;
}
class CustomRivosXVI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
@@ -49,6 +51,8 @@ class CustomRivosXVI<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
let Uses = [VL, VTYPE];
let RVVConstraint = NoConstraint;
+
+ let UseNamedOperandTable = true;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index bc65db1f77ffb..9cc5dbf595871 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -132,6 +132,8 @@ class RVInstVCCustom2Base<VCIXInfo info>
let RVVConstraint = info.RVVConstraint;
let ElementsDependOn = EltDepsVLMask;
let ReadsPastVL = 1;
+
+ let UseNamedOperandTable = true;
}
// VCIX instructions with GPR rs1 operand
@@ -254,7 +256,7 @@ let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in {
}
let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector",
- DestEEW = EEWSEWx4, RVVConstraint=VS2Constraint in {
+ DestEEW = EEWSEWx4, RVVConstraint=VS1Constraint in {
def SF_VQMACCU_2x8x2 : CustomSiFiveVMACC<0b101100, OPMVV, "sf.vqmaccu.2x8x2">;
def SF_VQMACC_2x8x2 : CustomSiFiveVMACC<0b101101, OPMVV, "sf.vqmacc.2x8x2">;
def SF_VQMACCUS_2x8x2 : CustomSiFiveVMACC<0b101110, OPMVV, "sf.vqmaccus.2x8x2">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index ebabd03731298..9aae940476ae2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -173,12 +173,10 @@ let Predicates = [HasStdExtZvkned] in {
defm VAESDM : VAES_MV_V_S<0b101000, 0b101001, 0b00000, OPMVV, "vaesdm">;
defm VAESEF : VAES_MV_V_S<0b101000, 0b101001, 0b00011, OPMVV, "vaesef">;
defm VAESEM : VAES_MV_V_S<0b101000, 0b101001, 0b00010, OPMVV, "vaesem">;
- let RVVConstraint = NoConstraint in {
- def VAESKF1_VI : PALUVINoVm<0b100010, "vaeskf1.vi", uimm5>,
- SchedUnaryMC<"WriteVAESKF1V", "ReadVAESKF1V">;
- def VAESKF2_VI : PALUVINoVmBinary<0b101010, "vaeskf2.vi", uimm5>,
- SchedBinaryMC<"WriteVAESKF2V", "ReadVAESKF2V", "ReadVAESKF2V">;
- }
+ def VAESKF1_VI : PALUVINoVm<0b100010, "vaeskf1.vi", uimm5>,
+ SchedUnaryMC<"WriteVAESKF1V", "ReadVAESKF1V">;
+ def VAESKF2_VI : PALUVINoVmBinary<0b101010, "vaeskf2.vi", uimm5>,
+ SchedBinaryMC<"WriteVAESKF2V", "ReadVAESKF2V", "ReadVAESKF2V">;
let RVVConstraint = VS2Constraint in
def VAESZ_VS : PALUVs2NoVmBinary<0b101001, 0b00111, OPMVV, "vaesz.vs">,
SchedBinaryMC<"WriteVAESZV", "ReadVAESZV", "ReadVAESZV">;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 8e834c74f5031..bf832154ad717 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -159,8 +159,8 @@ enum FusedCompareType {
} // end namespace SystemZII
namespace SystemZ {
-int64_t getTwoOperandOpcode(uint32_t Opcode);
-int64_t getTargetMemOpcode(uint32_t Opcode);
+int32_t getTwoOperandOpcode(uint32_t Opcode);
+int32_t getTargetMemOpcode(uint32_t Opcode);
// Return a version of comparison CC mask CCMask in which the LT and GT
// actions are swapped.
diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
index bbe3d4a89adce..c24ae77066f41 100644
--- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
+++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
@@ -25,9 +25,9 @@ Target &getTheWebAssemblyTarget64();
namespace WebAssembly {
-int64_t getStackOpcode(uint32_t Opcode);
-int64_t getRegisterOpcode(uint32_t Opcode);
-int64_t getWasm64Opcode(uint32_t Opcode);
+int32_t getStackOpcode(uint32_t Opcode);
+int32_t getRegisterOpcode(uint32_t Opcode);
+int32_t getWasm64Opcode(uint32_t Opcode);
} // namespace WebAssembly
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dbeb8fd86b835..3678327627b97 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24370,7 +24370,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
assert(And.getValueType().isScalarInteger() && "Scalar type expected");
APInt AndRHSVal;
- SDValue Shl, Src, BitNo;
+ SDValue Shl, Src, Mask, BitNo;
if (sd_match(And,
m_And(m_TruncOrSelf(m_Value(Src)),
m_TruncOrSelf(m_AllOf(m_Value(Shl),
@@ -24384,6 +24384,10 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
if (Known.countMinLeadingZeros() < (BitWidth - AndBitWidth))
return SDValue();
}
+ } else if (sd_match(And,
+ m_ReassociatableAnd(m_Value(Src), m_Value(Mask),
+ m_Shl(m_One(), m_Value(BitNo))))) {
+ // (Src & Mask & (1 << BitNo)) ==/!= 0
} else if (sd_match(And,
m_And(m_TruncOrSelf(m_Srl(m_Value(Src), m_Value(BitNo))),
m_One()))) {
@@ -24402,6 +24406,9 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
return SDValue();
}
+ if (Mask)
+ Src = DAG.getNode(ISD::AND, dl, Src.getValueType(), Src, Mask);
+
// Remove any bit flip.
if (isBitwiseNot(Src)) {
Src = Src.getOperand(0);
@@ -39314,6 +39321,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+ case X86ISD::FANDN:
case X86ISD::ANDNP: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index fc5d7519bdffe..3d381e26c37b2 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -515,13 +515,37 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["qsad-insts"] = true;
Features["cvt-pknorm-vop2-insts"] = true;
Features["fp8-conversion-insts"] = true;
+ Features["wmma-128b-insts"] = true;
Features["atomic-fmin-fmax-global-f32"] = true;
break;
case GK_GFX1170:
- // TODO-GFX1170: Update features map for gfx1170
+ Features["ci-insts"] = true;
+ Features["dot7-insts"] = true;
+ Features["dot8-insts"] = true;
+ Features["dot9-insts"] = true;
+ Features["dot10-insts"] = true;
+ Features["dot12-insts"] = true;
+ Features["dl-insts"] = true;
+ Features["16-bit-insts"] = true;
+ Features["dpp"] = true;
+ Features["gfx8-insts"] = true;
+ Features["gfx9-insts"] = true;
+ Features["gfx10-insts"] = true;
+ Features["gfx10-3-insts"] = true;
+ Features["gfx11-insts"] = true;
+ Features["atomic-fadd-rtn-insts"] = true;
+ Features["image-insts"] = true;
+ Features["cube-insts"] = true;
+ Features["lerp-inst"] = true;
+ Features["sad-insts"] = true;
+ Features["qsad-insts"] = true;
+ Features["cvt-pknorm-vop2-insts"] = true;
+ Features["gws"] = true;
Features["dot11-insts"] = true;
Features["fp8-conversion-insts"] = true;
- [[fallthrough]];
+ Features["wmma-128b-insts"] = true;
+ Features["atomic-fmin-fmax-global-f32"] = true;
+ break;
case GK_GFX1153:
case GK_GFX1152:
case GK_GFX1151:
@@ -554,6 +578,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["qsad-insts"] = true;
Features["cvt-pknorm-vop2-insts"] = true;
Features["gws"] = true;
+ Features["wmma-256b-insts"] = true;
Features["atomic-fmin-fmax-global-f32"] = true;
break;
case GK_GFX1036:
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
index 5ccf7b1adc3a7..61f8d6edce7b6 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
@@ -2,31 +2,7 @@
; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-declare i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtps.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtps.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtnu.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtns.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtns.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtmu.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtms.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtms.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtau.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtau.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtas.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtas.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half)
-declare i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half)
-declare i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half)
-declare half @llvm.aarch64.neon.frsqrte.f16(half)
-declare half @llvm.aarch64.neon.frecpx.f16(half)
-declare half @llvm.aarch64.neon.frecpe.f16(half)
-
-define dso_local i16 @t2(half %a) {
+define i16 @t2(half %a) {
; CHECK-SD-LABEL: t2:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcmp h0, #0.0
@@ -45,7 +21,7 @@ entry:
ret i16 %vceqz
}
-define dso_local i16 @t3(half %a) {
+define i16 @t3(half %a) {
; CHECK-SD-LABEL: t3:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcmp h0, #0.0
@@ -64,7 +40,7 @@ entry:
ret i16 %vcgez
}
-define dso_local i16 @t4(half %a) {
+define i16 @t4(half %a) {
; CHECK-SD-LABEL: t4:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcmp h0, #0.0
@@ -83,7 +59,7 @@ entry:
ret i16 %vcgtz
}
-define dso_local i16 @t5(half %a) {
+define i16 @t5(half %a) {
; CHECK-SD-LABEL: t5:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcmp h0, #0.0
@@ -102,7 +78,7 @@ entry:
ret i16 %vclez
}
-define dso_local i16 @t6(half %a) {
+define i16 @t6(half %a) {
; CHECK-SD-LABEL: t6:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcmp h0, #0.0
@@ -121,7 +97,7 @@ entry:
ret i16 %vcltz
}
-define dso_local half @t8(i32 %a) {
+define half @t8(i32 %a) {
; CHECK-LABEL: t8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: scvtf h0, w0
@@ -131,7 +107,7 @@ entry:
ret half %0
}
-define dso_local half @t9(i64 %a) {
+define half @t9(i64 %a) {
; CHECK-LABEL: t9:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: scvtf h0, x0
@@ -141,7 +117,7 @@ entry:
ret half %0
}
-define dso_local half @t12(i64 %a) {
+define half @t12(i64 %a) {
; CHECK-LABEL: t12:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ucvtf h0, x0
@@ -151,7 +127,7 @@ entry:
ret half %0
}
-define dso_local i16 @t13(half %a) {
+define i16 @t13(half %a) {
; CHECK-LABEL: t13:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs w0, h0
@@ -161,7 +137,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t15(half %a) {
+define i64 @t15(half %a) {
; CHECK-LABEL: t15:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs x0, h0
@@ -171,7 +147,7 @@ entry:
ret i64 %0
}
-define dso_local i16 @t16(half %a) {
+define i16 @t16(half %a) {
; CHECK-SD-LABEL: t16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtzs w0, h0
@@ -186,7 +162,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t18(half %a) {
+define i64 @t18(half %a) {
; CHECK-LABEL: t18:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu x0, h0
@@ -269,7 +245,7 @@ entry:
ret i16 %fcvt
}
-define dso_local i16 @t19(half %a) {
+define i16 @t19(half %a) {
; CHECK-LABEL: t19:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtas w0, h0
@@ -280,7 +256,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t21(half %a) {
+define i64 @t21(half %a) {
; CHECK-LABEL: t21:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtas x0, h0
@@ -301,7 +277,7 @@ entry:
ret i16 %fcvt
}
-define dso_local i16 @t22(half %a) {
+define i16 @t22(half %a) {
; CHECK-LABEL: t22:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtau w0, h0
@@ -312,7 +288,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t24(half %a) {
+define i64 @t24(half %a) {
; CHECK-LABEL: t24:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtau x0, h0
@@ -333,7 +309,7 @@ entry:
ret i16 %fcvt
}
-define dso_local i16 @t25(half %a) {
+define i16 @t25(half %a) {
; CHECK-LABEL: t25:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtms w0, h0
@@ -344,7 +320,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t27(half %a) {
+define i64 @t27(half %a) {
; CHECK-LABEL: t27:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtms x0, h0
@@ -365,7 +341,7 @@ entry:
ret i16 %fcvt
}
-define dso_local i16 @t28(half %a) {
+define i16 @t28(half %a) {
; CHECK-LABEL: t28:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtmu w0, h0
@@ -376,7 +352,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t30(half %a) {
+define i64 @t30(half %a) {
; CHECK-LABEL: t30:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtmu x0, h0
@@ -397,7 +373,7 @@ entry:
ret i16 %fcvt
}
-define dso_local i16 @t31(half %a) {
+define i16 @t31(half %a) {
; CHECK-LABEL: t31:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtns w0, h0
@@ -408,7 +384,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t33(half %a) {
+define i64 @t33(half %a) {
; CHECK-LABEL: t33:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtns x0, h0
@@ -429,7 +405,7 @@ entry:
ret i16 %fcvt
}
-define dso_local i16 @t34(half %a) {
+define i16 @t34(half %a) {
; CHECK-LABEL: t34:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtnu w0, h0
@@ -440,7 +416,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t36(half %a) {
+define i64 @t36(half %a) {
; CHECK-LABEL: t36:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtnu x0, h0
@@ -461,7 +437,7 @@ entry:
ret i16 %fcvt
}
-define dso_local i16 @t37(half %a) {
+define i16 @t37(half %a) {
; CHECK-LABEL: t37:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtps w0, h0
@@ -472,7 +448,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t39(half %a) {
+define i64 @t39(half %a) {
; CHECK-LABEL: t39:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtps x0, h0
@@ -493,7 +469,7 @@ entry:
ret i16 %fcvt
}
-define dso_local i16 @t40(half %a) {
+define i16 @t40(half %a) {
; CHECK-LABEL: t40:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtpu w0, h0
@@ -504,7 +480,7 @@ entry:
ret i16 %0
}
-define dso_local i64 @t42(half %a) {
+define i64 @t42(half %a) {
; CHECK-LABEL: t42:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtpu x0, h0
@@ -514,7 +490,7 @@ entry:
ret i64 %vcvtph_u64_f16
}
-define dso_local half @t44(half %a) {
+define half @t44(half %a) {
; CHECK-LABEL: t44:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: frecpe h0, h0
@@ -524,7 +500,7 @@ entry:
ret half %vrecpeh_f16
}
-define dso_local half @t45(half %a) {
+define half @t45(half %a) {
; CHECK-LABEL: t45:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: frecpx h0, h0
@@ -534,7 +510,7 @@ entry:
ret half %vrecpxh_f16
}
-define dso_local half @t53(half %a) {
+define half @t53(half %a) {
; CHECK-LABEL: t53:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: frsqrte h0, h0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index 9693d544d1535..450cd0701911a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
@@ -17,13 +18,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
@@ -32,13 +33,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
@@ -47,13 +48,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
@@ -62,13 +63,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
@@ -77,13 +78,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
@@ -92,11 +93,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
@@ -105,11 +106,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
@@ -118,11 +119,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x half> %C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
@@ -131,11 +132,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
@@ -144,13 +145,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -159,13 +160,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -174,13 +175,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -189,13 +190,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -204,13 +205,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -219,13 +220,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -234,13 +235,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -249,13 +250,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -264,13 +265,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
@@ -279,13 +280,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
@@ -294,11 +295,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
@@ -307,11 +308,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
@@ -322,13 +323,13 @@ bb:
; both neg and abs patterns (wmma matrix C f32 or f16 )
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%fneg.fabs.C = fneg <8 x float> %fabs.C
@@ -338,11 +339,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%fneg.fabs.C = fneg <8 x half> %fabs.C
@@ -352,15 +353,15 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%el3 = extractelement <8 x float> %C, i32 3
%el3.fabs = call float @llvm.fabs.f32(float %el3)
@@ -374,13 +375,13 @@ bb:
; A or B matrix modifier and constant in C
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -389,11 +390,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -404,6 +405,27 @@ bb:
; pack f16 elements with v_perm_b32 since they don't come from same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: flat_load_b128 v[12:15], v[8:9]
+; GFX1170-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16
+; GFX1170-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX1170-NEXT: v_and_b32_e32 v8, 0xffff, v12
+; GFX1170-NEXT: v_and_b32_e32 v9, 0xffff, v14
+; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX1170-NEXT: v_and_b32_e32 v16, 0xffff, v18
+; GFX1170-NEXT: v_lshl_or_b32 v12, v13, 16, v8
+; GFX1170-NEXT: v_lshl_or_b32 v13, v15, 16, v9
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1170-NEXT: v_lshl_or_b32 v14, v17, 16, v14
+; GFX1170-NEXT: v_lshl_or_b32 v15, v19, 16, v16
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
+; GFX1170-NEXT: global_store_b128 v[10:11], v[12:15], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
index 6b749df71223f..8f8267952cbe1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -16,27 +17,27 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
-; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
-; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GCN-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -44,13 +45,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -58,27 +59,27 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
-; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
-; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GCN-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -86,11 +87,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
@@ -98,19 +99,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x42004200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x42004200
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
@@ -118,19 +119,19 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x3f803f80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
@@ -138,19 +139,19 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
-; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x3fc03fc0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GCN-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
@@ -158,13 +159,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -172,27 +173,27 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_movk_i32 s0, 0x80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_movk_i32 s0, 0x80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -200,13 +201,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -214,27 +215,27 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_movk_i32 s0, 0x80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
-; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
-; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
-; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_movk_i32 s0, 0x80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
+; GCN-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GCN-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
+; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -242,13 +243,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -256,27 +257,27 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -284,13 +285,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -298,27 +299,27 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -326,13 +327,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -340,27 +341,27 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -368,13 +369,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -382,27 +383,27 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -410,13 +411,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -424,27 +425,27 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_movk_i32 s0, 0x80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s0
-; GFX12-NEXT: s_mov_b32 s6, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
-; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
-; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_movk_i32 s0, 0x80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s7, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s0
+; GCN-NEXT: s_mov_b32 s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GCN-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GCN-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GCN-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -473,3 +474,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
index 929a51bfff53c..37900d6db1027 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -16,13 +17,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -30,13 +31,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -46,13 +47,13 @@ bb:
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -60,13 +61,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -74,13 +75,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -90,13 +91,13 @@ bb:
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -104,13 +105,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -118,13 +119,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -136,13 +137,13 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -150,13 +151,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -164,13 +165,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -180,13 +181,13 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -194,13 +195,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -208,13 +209,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -224,13 +225,13 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -238,13 +239,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -252,13 +253,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -271,3 +272,6 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
index 7c0f72606a5ba..a3d0da7dfc143 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
@@ -1,7 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v20, v[20:21], off
+; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
@@ -32,6 +52,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v20, v[20:21], off
+; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
@@ -62,6 +101,19 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v16, v[16:17], off
+; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
@@ -86,6 +138,19 @@ bb:
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v16, v[16:17], off
+; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
@@ -110,6 +175,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -140,6 +224,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v11, v[11:12], off
+; GFX1170-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
+; GFX1170-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX1170-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX1170-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[13:14], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v11, v[11:12], off
@@ -170,6 +273,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -200,6 +322,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -230,6 +371,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -260,6 +420,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -299,3 +478,5 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
index da61bc4758879..4eacdbe171e3e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -16,13 +17,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -30,11 +31,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
@@ -42,11 +43,11 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
@@ -54,13 +55,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -68,13 +69,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -82,13 +83,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -96,13 +97,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -110,13 +111,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -124,13 +125,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -138,13 +139,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -153,13 +154,13 @@ bb:
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -167,13 +168,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -181,11 +182,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
-; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %res, ptr addrspace(1) %out
@@ -193,11 +194,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
-; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
store <8 x i16> %res, ptr addrspace(1) %out
@@ -205,13 +206,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -219,13 +220,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -233,13 +234,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -247,13 +248,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -261,13 +262,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -275,13 +276,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -289,13 +290,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -324,3 +325,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index a345ee6def7a7..3886a072b1763 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
@@ -15,11 +16,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
@@ -28,11 +29,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
@@ -41,11 +42,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
@@ -54,11 +55,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
@@ -67,11 +68,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
@@ -80,11 +81,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
@@ -93,11 +94,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
@@ -106,11 +107,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x half> %C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
@@ -119,11 +120,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
@@ -132,11 +133,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -145,11 +146,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -158,11 +159,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -171,11 +172,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -184,11 +185,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -197,11 +198,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -210,11 +211,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -223,11 +224,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -236,11 +237,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
@@ -249,11 +250,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
@@ -262,11 +263,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
@@ -275,11 +276,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
@@ -290,11 +291,11 @@ bb:
; both neg and abs patterns (wmma matrix C f32 or f16 )
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%fneg.fabs.C = fneg <4 x float> %fabs.C
@@ -304,11 +305,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%fneg.fabs.C = fneg <4 x half> %fabs.C
@@ -318,13 +319,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%el3 = extractelement <4 x float> %C, i32 3
%el3.fabs = call float @llvm.fabs.f32(float %el3)
@@ -338,11 +339,11 @@ bb:
; A or B matrix modifier and constant in C
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -351,11 +352,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -366,6 +367,20 @@ bb:
; pack f16 elements with v_perm_b32 since they don't come from same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: flat_load_b128 v[8:11], v[4:5]
+; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-NEXT: v_and_b32_e32 v4, 0xffff, v8
+; GFX1170-NEXT: v_and_b32_e32 v5, 0xffff, v10
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1170-NEXT: v_lshl_or_b32 v4, v9, 16, v4
+; GFX1170-NEXT: v_lshl_or_b32 v5, v11, 16, v5
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX1170-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
index 5344ab8da1ade..ce9b8f9fc3c14 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -14,21 +15,21 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v9, s3
-; GFX12-NEXT: v_mov_b32_e32 v8, s2
-; GFX12-NEXT: v_mov_b32_e32 v7, s1
-; GFX12-NEXT: v_mov_b32_e32 v6, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s3
+; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -36,11 +37,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -48,21 +49,21 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v9, s3
-; GFX12-NEXT: v_mov_b32_e32 v8, s2
-; GFX12-NEXT: v_mov_b32_e32 v7, s1
-; GFX12-NEXT: v_mov_b32_e32 v6, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s3
+; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -70,11 +71,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
@@ -82,17 +83,17 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x42004200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s1
-; GFX12-NEXT: v_mov_b32_e32 v6, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x42004200
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
@@ -100,17 +101,17 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s1
-; GFX12-NEXT: v_mov_b32_e32 v6, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x3f803f80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
@@ -118,17 +119,17 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s1
-; GFX12-NEXT: v_mov_b32_e32 v6, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x3fc03fc0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
@@ -136,11 +137,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -148,21 +149,21 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_movk_i32 s0, 0x80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s2
-; GFX12-NEXT: v_mov_b32_e32 v5, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_movk_i32 s0, 0x80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -170,11 +171,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -182,21 +183,21 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_movk_i32 s0, 0x80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s2
-; GFX12-NEXT: v_mov_b32_e32 v5, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_movk_i32 s0, 0x80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -204,11 +205,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -216,21 +217,21 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s2
-; GFX12-NEXT: v_mov_b32_e32 v5, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -238,11 +239,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -250,21 +251,21 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s2
-; GFX12-NEXT: v_mov_b32_e32 v5, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -272,11 +273,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -284,21 +285,21 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s2
-; GFX12-NEXT: v_mov_b32_e32 v5, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -306,11 +307,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -318,21 +319,21 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_mov_b32 s0, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s2
-; GFX12-NEXT: v_mov_b32_e32 v5, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -340,11 +341,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -352,21 +353,21 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_movk_i32 s0, 0x80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s2
-; GFX12-NEXT: v_mov_b32_e32 v5, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_movk_i32 s0, 0x80
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_mov_b32 s3, s0
+; GCN-NEXT: s_mov_b32 s1, s0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NEXT: v_mov_b32_e32 v6, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -384,3 +385,6 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
index e47350db4003e..a87163b0dca14 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -14,11 +15,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -26,11 +27,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -40,11 +41,11 @@ bb:
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -52,11 +53,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -64,11 +65,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -78,11 +79,11 @@ bb:
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -90,11 +91,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -102,11 +103,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -119,11 +120,11 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -131,11 +132,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -143,11 +144,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -157,11 +158,11 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -169,11 +170,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -181,11 +182,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
-; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -195,11 +196,11 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -207,11 +208,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -219,11 +220,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -236,3 +237,6 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 imma
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index da6852042f7f5..7d31e262b4862 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,7 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v10, v[10:11], off
+; GFX1170-NEXT: v_mov_b32_e32 v23, v9
+; GFX1170-NEXT: v_mov_b32_e32 v22, v8
+; GFX1170-NEXT: v_mov_b32_e32 v21, v7
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v9
+; GFX1170-NEXT: v_mov_b32_e32 v26, v8
+; GFX1170-NEXT: v_mov_b32_e32 v25, v7
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v31, v9
+; GFX1170-NEXT: v_mov_b32_e32 v30, v8
+; GFX1170-NEXT: v_mov_b32_e32 v29, v7
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
@@ -46,6 +74,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v10, v[10:11], off
+; GFX1170-NEXT: v_mov_b32_e32 v23, v9
+; GFX1170-NEXT: v_mov_b32_e32 v22, v8
+; GFX1170-NEXT: v_mov_b32_e32 v21, v7
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v9
+; GFX1170-NEXT: v_mov_b32_e32 v26, v8
+; GFX1170-NEXT: v_mov_b32_e32 v25, v7
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v31, v9
+; GFX1170-NEXT: v_mov_b32_e32 v30, v8
+; GFX1170-NEXT: v_mov_b32_e32 v29, v7
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
@@ -90,6 +145,27 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v22, v[8:9], off
+; GFX1170-NEXT: v_mov_b32_e32 v9, v7
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v7
+; GFX1170-NEXT: v_mov_b32_e32 v18, v6
+; GFX1170-NEXT: v_mov_b32_e32 v21, v7
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
@@ -128,6 +204,27 @@ bb:
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v22, v[8:9], off
+; GFX1170-NEXT: v_mov_b32_e32 v9, v7
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v7
+; GFX1170-NEXT: v_mov_b32_e32 v18, v6
+; GFX1170-NEXT: v_mov_b32_e32 v21, v7
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
@@ -166,6 +263,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -210,6 +334,21 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v6, v[6:7], off
+; GFX1170-NEXT: v_mov_b32_e32 v15, v5
+; GFX1170-NEXT: v_mov_b32_e32 v14, v4
+; GFX1170-NEXT: v_mov_b32_e32 v13, v3
+; GFX1170-NEXT: v_mov_b32_e32 v12, v2
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX1170-NEXT: global_store_b128 v[8:9], v[12:15], off
+; GFX1170-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
@@ -236,6 +375,21 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v16, v6
+; GFX1170-NEXT: v_mov_b32_e32 v15, v5
+; GFX1170-NEXT: v_mov_b32_e32 v14, v4
+; GFX1170-NEXT: v_mov_b32_e32 v13, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: global_store_b128 v[9:10], v[13:16], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -262,6 +416,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -306,6 +487,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -350,6 +558,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -394,6 +629,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -448,3 +710,5 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
index 957b7b1b2c77c..bb256883c29ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -14,11 +15,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -26,11 +27,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
@@ -38,11 +39,11 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
@@ -50,11 +51,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -62,11 +63,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -74,11 +75,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -86,11 +87,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -98,11 +99,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -110,11 +111,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -122,11 +123,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -134,11 +135,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
-; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -146,11 +147,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
-; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -158,11 +159,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
-; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
store <4 x half> %res, ptr addrspace(1) %out
@@ -170,11 +171,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
-; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
store <4 x i16> %res, ptr addrspace(1) %out
@@ -182,11 +183,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -194,11 +195,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
-; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -206,11 +207,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -218,11 +219,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -230,11 +231,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -242,11 +243,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -254,11 +255,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -287,3 +288,6 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 98cb09642511e..6919c6d3f70ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX12
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX12
declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index 4a010071d58c8..bc5c3283fb49e 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
@@ -17,13 +18,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
@@ -32,13 +33,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
@@ -47,13 +48,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
@@ -62,13 +63,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
@@ -77,13 +78,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
@@ -92,11 +93,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
@@ -105,11 +106,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
@@ -118,11 +119,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x half> %C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
@@ -131,11 +132,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
@@ -144,13 +145,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -159,13 +160,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -174,13 +175,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -189,13 +190,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -204,13 +205,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -219,13 +220,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -234,13 +235,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
@@ -249,13 +250,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
@@ -264,13 +265,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
@@ -279,13 +280,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
@@ -294,11 +295,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
@@ -307,11 +308,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
@@ -322,13 +323,13 @@ bb:
; both neg and abs patterns (wmma matrix C f32 or f16 )
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%fneg.fabs.C = fneg <8 x float> %fabs.C
@@ -338,11 +339,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%fneg.fabs.C = fneg <8 x half> %fabs.C
@@ -352,15 +353,15 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%el3 = extractelement <8 x float> %C, i32 3
%el3.fabs = call float @llvm.fabs.f32(float %el3)
@@ -374,13 +375,13 @@ bb:
; A or B matrix modifier and constant in C
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -389,11 +390,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -404,6 +405,24 @@ bb:
; pack f16 elements with v_perm_b32 since they don't come from same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16
+; GFX1170-NEXT: flat_load_b128 v[16:19], v[8:9]
+; GFX1170-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; GFX1170-NEXT: v_mov_b16_e32 v8.l, v15.l
+; GFX1170-NEXT: v_mov_b16_e32 v9.l, v14.l
+; GFX1170-NEXT: v_perm_b32 v14, v13, v12, 0x5040100
+; GFX1170-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-NEXT: v_perm_b32 v13, v19, v18, 0x5040100
+; GFX1170-NEXT: v_perm_b32 v12, v17, v16, 0x5040100
+; GFX1170-NEXT: v_perm_b32 v15, v8, v9, 0x5040100
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
+; GFX1170-NEXT: global_store_b128 v[10:11], v[12:15], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
index 1b44e8f01c0f9..2558dc3903640 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -16,6 +17,24 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v11, v10
+; GFX1170-NEXT: v_mov_b32_e32 v12, v10
+; GFX1170-NEXT: v_mov_b32_e32 v13, v10
+; GFX1170-NEXT: v_mov_b32_e32 v14, v10
+; GFX1170-NEXT: v_mov_b32_e32 v15, v10
+; GFX1170-NEXT: v_mov_b32_e32 v16, v10
+; GFX1170-NEXT: v_mov_b32_e32 v17, v10
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
@@ -36,13 +55,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -50,6 +69,24 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v11, v10
+; GFX1170-NEXT: v_mov_b32_e32 v12, v10
+; GFX1170-NEXT: v_mov_b32_e32 v13, v10
+; GFX1170-NEXT: v_mov_b32_e32 v14, v10
+; GFX1170-NEXT: v_mov_b32_e32 v15, v10
+; GFX1170-NEXT: v_mov_b32_e32 v16, v10
+; GFX1170-NEXT: v_mov_b32_e32 v17, v10
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
@@ -70,11 +107,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
-; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
+; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
@@ -82,6 +119,17 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v10, 0x42004200
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v11, v10
+; GFX1170-NEXT: v_mov_b32_e32 v12, v10
+; GFX1170-NEXT: v_mov_b32_e32 v13, v10
+; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200
@@ -98,6 +146,17 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3f803f80
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v11, v10
+; GFX1170-NEXT: v_mov_b32_e32 v12, v10
+; GFX1170-NEXT: v_mov_b32_e32 v13, v10
+; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x3f803f80
@@ -114,6 +173,17 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v11, v10
+; GFX1170-NEXT: v_mov_b32_e32 v12, v10
+; GFX1170-NEXT: v_mov_b32_e32 v13, v10
+; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
@@ -130,13 +200,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -144,6 +214,24 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v7, v6
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v9, v6
+; GFX1170-NEXT: v_mov_b32_e32 v10, v6
+; GFX1170-NEXT: v_mov_b32_e32 v11, v6
+; GFX1170-NEXT: v_mov_b32_e32 v12, v6
+; GFX1170-NEXT: v_mov_b32_e32 v13, v6
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
@@ -164,13 +252,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -178,6 +266,24 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v4, 0x80
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v5, v4
+; GFX1170-NEXT: v_mov_b32_e32 v6, v4
+; GFX1170-NEXT: v_mov_b32_e32 v7, v4
+; GFX1170-NEXT: v_mov_b32_e32 v8, v4
+; GFX1170-NEXT: v_mov_b32_e32 v9, v4
+; GFX1170-NEXT: v_mov_b32_e32 v10, v4
+; GFX1170-NEXT: v_mov_b32_e32 v11, v4
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX1170-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
@@ -198,13 +304,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -212,6 +318,24 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v7, v6
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v9, v6
+; GFX1170-NEXT: v_mov_b32_e32 v10, v6
+; GFX1170-NEXT: v_mov_b32_e32 v11, v6
+; GFX1170-NEXT: v_mov_b32_e32 v12, v6
+; GFX1170-NEXT: v_mov_b32_e32 v13, v6
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
@@ -232,13 +356,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -246,6 +370,24 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v7, v6
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v9, v6
+; GFX1170-NEXT: v_mov_b32_e32 v10, v6
+; GFX1170-NEXT: v_mov_b32_e32 v11, v6
+; GFX1170-NEXT: v_mov_b32_e32 v12, v6
+; GFX1170-NEXT: v_mov_b32_e32 v13, v6
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
@@ -266,13 +408,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -280,6 +422,24 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v7, v6
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v9, v6
+; GFX1170-NEXT: v_mov_b32_e32 v10, v6
+; GFX1170-NEXT: v_mov_b32_e32 v11, v6
+; GFX1170-NEXT: v_mov_b32_e32 v12, v6
+; GFX1170-NEXT: v_mov_b32_e32 v13, v6
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
@@ -300,13 +460,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, ptr addrspace(1) %out
@@ -314,6 +474,24 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v7, v6
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v9, v6
+; GFX1170-NEXT: v_mov_b32_e32 v10, v6
+; GFX1170-NEXT: v_mov_b32_e32 v11, v6
+; GFX1170-NEXT: v_mov_b32_e32 v12, v6
+; GFX1170-NEXT: v_mov_b32_e32 v13, v6
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
@@ -334,13 +512,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -348,6 +526,24 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX1170-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_mov_b32_e32 v7, v6
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v9, v6
+; GFX1170-NEXT: v_mov_b32_e32 v10, v6
+; GFX1170-NEXT: v_mov_b32_e32 v11, v6
+; GFX1170-NEXT: v_mov_b32_e32 v12, v6
+; GFX1170-NEXT: v_mov_b32_e32 v13, v6
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
index 945305848b3e1..9d8f26ea11cb8 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -16,13 +17,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -30,13 +31,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -46,13 +47,13 @@ bb:
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -60,13 +61,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -74,13 +75,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -90,13 +91,13 @@ bb:
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -104,13 +105,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -118,13 +119,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -136,13 +137,13 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -150,13 +151,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -164,13 +165,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -180,13 +181,13 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -194,13 +195,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -208,13 +209,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -224,13 +225,13 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -238,13 +239,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -252,13 +253,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -271,3 +272,6 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
index cd7edc21718c9..f7dd2d189a2b2 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
@@ -1,7 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v20, v[20:21], off
+; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
@@ -32,6 +52,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v20, v[20:21], off
+; GFX1170-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX1170-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX1170-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX1170-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX1170-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v20, v[20:21], off
@@ -62,6 +101,19 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v16, v[16:17], off
+; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
@@ -86,6 +138,19 @@ bb:
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v16, v[16:17], off
+; GFX1170-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX1170-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX1170-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v16, v[16:17], off
@@ -110,6 +175,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -140,6 +224,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v11, v[11:12], off
+; GFX1170-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
+; GFX1170-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX1170-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX1170-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
+; GFX1170-NEXT: global_store_b128 v[13:14], v[17:20], off
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v11, v[11:12], off
@@ -170,6 +273,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -200,6 +322,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -230,6 +371,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -260,6 +420,25 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v14, v[14:15], off
+; GFX1170-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX1170-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX1170-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX1170-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX1170-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX1170-NEXT: s_clause 0x1
+; GFX1170-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v14, v[14:15], off
@@ -299,3 +478,5 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
index d67625248669a..0993c00c30415 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
@@ -1,14 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -16,13 +17,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GCN-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -30,11 +31,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
@@ -42,11 +43,11 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
@@ -54,13 +55,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -68,13 +69,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GCN-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -82,13 +83,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -96,13 +97,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -110,13 +111,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -124,13 +125,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
store <8 x float> %res, ptr addrspace(1) %out
@@ -138,13 +139,13 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -153,13 +154,13 @@ bb:
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -167,13 +168,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GCN-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -181,11 +182,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
-; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %res, ptr addrspace(1) %out
@@ -193,11 +194,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
-; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
store <8 x i16> %res, ptr addrspace(1) %out
@@ -205,13 +206,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -219,13 +220,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GCN-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -233,13 +234,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@@ -247,13 +248,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -261,13 +262,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -275,13 +276,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -289,13 +290,13 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GCN-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, ptr addrspace(1) %out
@@ -324,3 +325,6 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 53bede84513c9..1a2d59e969590 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,13 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-TRUE16
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12-FAKE16
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
@@ -16,11 +18,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
@@ -29,11 +31,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
@@ -42,11 +44,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
@@ -55,11 +57,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
@@ -68,11 +70,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
@@ -81,11 +83,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
@@ -94,11 +96,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
@@ -107,11 +109,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x half> %C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
@@ -120,11 +122,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
@@ -133,11 +135,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -146,11 +148,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -159,11 +161,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -172,11 +174,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -185,11 +187,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -198,11 +200,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -211,11 +213,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
@@ -224,11 +226,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
@@ -237,11 +239,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
@@ -250,11 +252,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
@@ -263,11 +265,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
@@ -276,11 +278,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
@@ -291,11 +293,11 @@ bb:
; both neg and abs patterns (wmma matrix C f32 or f16 )
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%fneg.fabs.C = fneg <4 x float> %fabs.C
@@ -305,11 +307,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%fneg.fabs.C = fneg <4 x half> %fabs.C
@@ -319,13 +321,13 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%el3 = extractelement <4 x float> %C, i32 3
%el3.fabs = call float @llvm.fabs.f32(float %el3)
@@ -339,11 +341,11 @@ bb:
; A or B matrix modifier and constant in C
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -352,11 +354,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -367,6 +369,29 @@ bb:
; pack f16 elements with v_perm_b32 since they don't come from same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
+; GFX1170-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170-TRUE16: ; %bb.0: ; %bb
+; GFX1170-TRUE16-NEXT: flat_load_b128 v[8:11], v[4:5]
+; GFX1170-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-TRUE16-NEXT: v_mov_b16_e32 v10.h, v11.l
+; GFX1170-TRUE16-NEXT: v_mov_b16_e32 v8.h, v9.l
+; GFX1170-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1170-TRUE16-NEXT: v_mov_b32_e32 v9, v10
+; GFX1170-TRUE16-NEXT: v_wmma_f16_16x16x16_f16 v[8:9], v[0:1], v[2:3], v[8:9] neg_lo:[0,0,1]
+; GFX1170-TRUE16-NEXT: global_store_b64 v[6:7], v[8:9], off
+; GFX1170-TRUE16-NEXT: s_endpgm
+;
+; GFX1170-FAKE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX1170-FAKE16: ; %bb.0: ; %bb
+; GFX1170-FAKE16-NEXT: flat_load_b128 v[8:11], v[4:5]
+; GFX1170-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1170-FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
+; GFX1170-FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
+; GFX1170-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-FAKE16-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX1170-FAKE16-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX1170-FAKE16-NEXT: s_endpgm
+;
; GFX12-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12-TRUE16: ; %bb.0: ; %bb
; GFX12-TRUE16-NEXT: flat_load_b128 v[8:11], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index a8f5726632aa1..a4222338a5038 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -14,16 +15,16 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v7, v6
-; GFX12-NEXT: v_mov_b32_e32 v8, v6
-; GFX12-NEXT: v_mov_b32_e32 v9, v6
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v7, v6
+; GCN-NEXT: v_mov_b32_e32 v8, v6
+; GCN-NEXT: v_mov_b32_e32 v9, v6
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -31,11 +32,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -43,16 +44,16 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v7, v6
-; GFX12-NEXT: v_mov_b32_e32 v8, v6
-; GFX12-NEXT: v_mov_b32_e32 v9, v6
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
-; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v7, v6
+; GCN-NEXT: v_mov_b32_e32 v8, v6
+; GCN-NEXT: v_mov_b32_e32 v9, v6
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -60,11 +61,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
@@ -72,14 +73,14 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v7, v6
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x42004200
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v7, v6
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
@@ -87,14 +88,14 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x3f803f80
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v7, v6
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x3f803f80
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v7, v6
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
@@ -102,14 +103,14 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v7, v6
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
-; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v7, v6
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GCN-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
@@ -117,11 +118,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -129,16 +130,16 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v5, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v7, v4
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v4, 0x80
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v5, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: v_mov_b32_e32 v7, v4
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -146,11 +147,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -158,16 +159,16 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v5, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v7, v4
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v4, 0x80
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v5, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: v_mov_b32_e32 v7, v4
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -175,11 +176,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -187,16 +188,16 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v5, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v7, v4
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v5, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: v_mov_b32_e32 v7, v4
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -204,11 +205,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -216,16 +217,16 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v5, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v7, v4
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v5, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: v_mov_b32_e32 v7, v4
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -233,11 +234,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -245,16 +246,16 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v5, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v7, v4
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v5, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: v_mov_b32_e32 v7, v4
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -262,11 +263,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -274,16 +275,16 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v5, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v7, v4
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v5, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: v_mov_b32_e32 v7, v4
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
store <4 x float> %res, ptr addrspace(1) %out
@@ -291,11 +292,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -303,16 +304,16 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v5, v4
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: v_mov_b32_e32 v7, v4
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
-; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_mov_b32_e32 v4, 0x80
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_mov_b32_e32 v5, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: v_mov_b32_e32 v7, v4
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
+; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -330,3 +331,6 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, <
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index 9303dbfad437f..baeb81ab62957 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -14,11 +15,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -26,11 +27,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -40,11 +41,11 @@ bb:
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -52,11 +53,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -64,11 +65,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -78,11 +79,11 @@ bb:
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -90,11 +91,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -102,11 +103,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -114,11 +115,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -126,11 +127,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -138,11 +139,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -152,11 +153,11 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -164,11 +165,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -176,11 +177,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
-; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -190,11 +191,11 @@ bb:
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -202,11 +203,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -214,11 +215,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -231,3 +232,6 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index fdfec74e01b7b..183230a1242bf 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,7 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v10, v[10:11], off
+; GFX1170-NEXT: v_mov_b32_e32 v23, v9
+; GFX1170-NEXT: v_mov_b32_e32 v22, v8
+; GFX1170-NEXT: v_mov_b32_e32 v21, v7
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v9
+; GFX1170-NEXT: v_mov_b32_e32 v26, v8
+; GFX1170-NEXT: v_mov_b32_e32 v25, v7
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v31, v9
+; GFX1170-NEXT: v_mov_b32_e32 v30, v8
+; GFX1170-NEXT: v_mov_b32_e32 v29, v7
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
@@ -46,6 +74,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v10, v[10:11], off
+; GFX1170-NEXT: v_mov_b32_e32 v23, v9
+; GFX1170-NEXT: v_mov_b32_e32 v22, v8
+; GFX1170-NEXT: v_mov_b32_e32 v21, v7
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v9
+; GFX1170-NEXT: v_mov_b32_e32 v26, v8
+; GFX1170-NEXT: v_mov_b32_e32 v25, v7
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v31, v9
+; GFX1170-NEXT: v_mov_b32_e32 v30, v8
+; GFX1170-NEXT: v_mov_b32_e32 v29, v7
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX1170-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX1170-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX1170-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX1170-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
@@ -90,6 +145,27 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v22, v[8:9], off
+; GFX1170-NEXT: v_mov_b32_e32 v9, v7
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v7
+; GFX1170-NEXT: v_mov_b32_e32 v18, v6
+; GFX1170-NEXT: v_mov_b32_e32 v21, v7
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX1170-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
@@ -128,6 +204,27 @@ bb:
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v22, v[8:9], off
+; GFX1170-NEXT: v_mov_b32_e32 v9, v7
+; GFX1170-NEXT: v_mov_b32_e32 v8, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v7
+; GFX1170-NEXT: v_mov_b32_e32 v18, v6
+; GFX1170-NEXT: v_mov_b32_e32 v21, v7
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX1170-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX1170-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX1170-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX1170-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX1170-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
@@ -166,6 +263,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -210,6 +334,21 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v6, v[6:7], off
+; GFX1170-NEXT: v_mov_b32_e32 v15, v5
+; GFX1170-NEXT: v_mov_b32_e32 v14, v4
+; GFX1170-NEXT: v_mov_b32_e32 v13, v3
+; GFX1170-NEXT: v_mov_b32_e32 v12, v2
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
+; GFX1170-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX1170-NEXT: global_store_b128 v[8:9], v[12:15], off
+; GFX1170-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
@@ -236,6 +375,21 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX1170-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v16, v6
+; GFX1170-NEXT: v_mov_b32_e32 v15, v5
+; GFX1170-NEXT: v_mov_b32_e32 v14, v4
+; GFX1170-NEXT: v_mov_b32_e32 v13, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: global_store_b128 v[9:10], v[13:16], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -262,6 +416,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -306,6 +487,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -350,6 +558,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -394,6 +629,33 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX1170-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX1170: ; %bb.0: ; %bb
+; GFX1170-NEXT: global_load_b32 v7, v[7:8], off
+; GFX1170-NEXT: v_mov_b32_e32 v20, v6
+; GFX1170-NEXT: v_mov_b32_e32 v19, v5
+; GFX1170-NEXT: v_mov_b32_e32 v18, v4
+; GFX1170-NEXT: v_mov_b32_e32 v17, v3
+; GFX1170-NEXT: v_mov_b32_e32 v24, v6
+; GFX1170-NEXT: v_mov_b32_e32 v23, v5
+; GFX1170-NEXT: v_mov_b32_e32 v22, v4
+; GFX1170-NEXT: v_mov_b32_e32 v21, v3
+; GFX1170-NEXT: v_mov_b32_e32 v28, v6
+; GFX1170-NEXT: v_mov_b32_e32 v27, v5
+; GFX1170-NEXT: v_mov_b32_e32 v26, v4
+; GFX1170-NEXT: v_mov_b32_e32 v25, v3
+; GFX1170-NEXT: s_waitcnt vmcnt(0)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX1170-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX1170-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX1170-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX1170-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX1170-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX1170-NEXT: s_endpgm
+;
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
@@ -448,3 +710,5 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
index 896efb06d5595..60dc7cc766f75 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX1170
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck %s --check-prefixes=GCN,GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -14,11 +15,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
-; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GCN-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -26,11 +27,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f16_16x16x16_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
store <4 x half> %res, ptr addrspace(1) %out
@@ -38,11 +39,11 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
-; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
store <4 x i16> %res, ptr addrspace(1) %out
@@ -50,11 +51,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -62,11 +63,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x16_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -74,11 +75,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -86,11 +87,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -98,11 +99,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -110,11 +111,11 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
store <4 x float> %res, ptr addrspace(1) %out
@@ -122,11 +123,11 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
-; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_wmma_i32_16x16x32_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+; GCN-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -134,11 +135,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
-; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -146,11 +147,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
-; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+; GCN-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -158,11 +159,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
-; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f16_16x16x32_f16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
store <4 x half> %res, ptr addrspace(1) %out
@@ -170,11 +171,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
-; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+; GCN-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
store <4 x i16> %res, ptr addrspace(1) %out
@@ -182,11 +183,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -194,11 +195,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
-; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+; GCN-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -206,11 +207,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
store <4 x i32> %res, ptr addrspace(1) %out
@@ -218,11 +219,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -230,11 +231,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -242,11 +243,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -254,11 +255,11 @@ bb:
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
-; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
-; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT: s_endpgm
+; GCN-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+; GCN-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GCN-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
store <4 x float> %res, ptr addrspace(1) %out
@@ -287,3 +288,6 @@ declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1170: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
index ef85de2012943..897bd2d8517a4 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX1170 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
@@ -11,12 +12,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
...
@@ -27,12 +28,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
...
@@ -43,11 +44,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
...
@@ -58,12 +59,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
@@ -73,12 +74,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
@@ -89,11 +90,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
- ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
...
@@ -104,12 +105,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
@@ -120,12 +121,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
@@ -136,11 +137,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
...
@@ -151,12 +152,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
@@ -167,12 +168,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
...
@@ -183,11 +184,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
- ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
...
@@ -198,6 +199,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+ ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
+ ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+ ; GFX1170-NEXT: {{ $}}
+ ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX1170-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
+ ;
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
; GFX12-NEXT: {{ $}}
@@ -214,12 +221,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
- ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
...
@@ -230,12 +237,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
- ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
...
@@ -246,11 +253,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
...
@@ -261,6 +268,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX1170-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
+ ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX1170-NEXT: {{ $}}
+ ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX1170-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
+ ;
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
; GFX12-NEXT: {{ $}}
@@ -277,12 +290,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
...
@@ -293,12 +306,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
...
@@ -309,11 +322,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
...
@@ -324,6 +337,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
+ ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GFX1170-NEXT: {{ $}}
+ ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX1170-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
+ ;
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
; GFX12-NEXT: {{ $}}
@@ -340,12 +359,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
index 4073964e2b038..0a80543b9977d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX1170 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
@@ -11,12 +12,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
...
@@ -27,12 +28,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
...
@@ -43,11 +44,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
...
@@ -58,12 +59,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
...
@@ -74,12 +75,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
@@ -90,11 +91,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
...
@@ -105,12 +106,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
@@ -121,12 +122,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
@@ -137,11 +138,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
...
@@ -152,12 +153,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
@@ -168,12 +169,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
...
@@ -184,11 +185,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
- ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
...
@@ -199,6 +200,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+ ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
+ ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+ ; GFX1170-NEXT: {{ $}}
+ ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX1170-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
+ ;
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
; GFX12-NEXT: {{ $}}
@@ -215,12 +222,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
- ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
...
@@ -231,12 +238,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
- ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
...
@@ -247,11 +254,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
...
@@ -262,6 +269,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
+ ; GFX1170-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
+ ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
+ ; GFX1170-NEXT: {{ $}}
+ ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX1170-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
+ ;
; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
; GFX12-NEXT: {{ $}}
@@ -278,12 +291,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
...
@@ -294,12 +307,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
...
@@ -310,11 +323,11 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
...
@@ -325,6 +338,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX1170-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
+ ; GFX1170: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX1170-NEXT: {{ $}}
+ ; GFX1170-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX1170-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
+ ;
; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
; GFX12-NEXT: {{ $}}
@@ -341,12 +360,12 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
- ; GFX12-NEXT: V_NOP_e32 implicit $exec
- ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
+ ; GCN-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
...
diff --git a/llvm/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir b/llvm/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
index 7160a3e3c3d84..5666c2141c5ee 100644
--- a/llvm/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
+++ b/llvm/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
@@ -17,7 +17,7 @@ body: |
bb.0.entry:
liveins: $rdi
- ; CHECK: [[@LINE+1]]:78: expected tied-def or low-level type after '('
+ ; CHECK: [[@LINE+1]]:78: expected an integer literal after 'tied-def'
INLINEASM &"$foo", 1, 2818058, def $rdi, 2147483657, killed $rdi(tied-def)
$rax = COPY killed $rdi
RET64 killed $rax
diff --git a/llvm/test/CodeGen/MIR/X86/invalid-tied-physical-reg-def.mir b/llvm/test/CodeGen/MIR/X86/invalid-tied-physical-reg-def.mir
new file mode 100644
index 0000000000000..66c458e79f316
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/X86/invalid-tied-physical-reg-def.mir
@@ -0,0 +1,15 @@
+# RUN: not llc -mtriple=x86_64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+---
+name: test
+tracksRegLiveness: true
+liveins:
+ - { reg: '$rdi' }
+body: |
+ bb.0.entry:
+ liveins: $rdi
+
+ ; CHECK: [[@LINE+1]]:45: tied-def not supported for defs
+ INLINEASM &"$foo", 1, 2818058, def $rdi(tied-def 5), 2147483657, killed $rdi(tied-def 3)
+ $rax = COPY killed $rdi
+ RET64 killed $rax
+...
diff --git a/llvm/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir b/llvm/test/CodeGen/MIR/X86/invalid-type-physical-reg.mir
similarity index 87%
rename from llvm/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir
rename to llvm/test/CodeGen/MIR/X86/invalid-type-physical-reg.mir
index a2c65dd3f0195..f2d94339e3ae5 100644
--- a/llvm/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir
+++ b/llvm/test/CodeGen/MIR/X86/invalid-type-physical-reg.mir
@@ -17,7 +17,7 @@ body: |
bb.0.entry:
liveins: $rdi
- ; CHECK: [[@LINE+1]]:70: expected tied-def or low-level type after '('
+ ; CHECK: [[@LINE+1]]:70: unexpected type on physical register
INLINEASM &"$foo", 1, 2818058, def $rdi, 2147483657, killed $rdi(3)
$rax = COPY killed $rdi
RET64 killed $rax
diff --git a/llvm/test/CodeGen/PowerPC/clmul-vector.ll b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
new file mode 100644
index 0000000000000..9089dca5b0ed7
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
@@ -0,0 +1,8874 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=powerpc64-- | FileCheck %s --check-prefixes=CHECK,BE
+; RUN: llc < %s -mtriple=powerpc64le-- | FileCheck %s --check-prefixes=CHECK,LE
+
+define <16 x i8> @clmul_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; BE-LABEL: clmul_v16i8:
+; BE: # %bb.0:
+; BE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; BE-NEXT: vspltisb 4, 2
+; BE-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; BE-NEXT: vand 4, 3, 4
+; BE-NEXT: lvx 10, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI0_1@toc@ha
+; BE-NEXT: vspltisb 5, 1
+; BE-NEXT: addi 3, 3, .LCPI0_1@toc@l
+; BE-NEXT: vspltisb 0, 4
+; BE-NEXT: vand 5, 3, 5
+; BE-NEXT: vspltisb 6, 8
+; BE-NEXT: vspltisb 8, -1
+; BE-NEXT: vmuloub 9, 2, 4
+; BE-NEXT: vmuleub 4, 2, 4
+; BE-NEXT: vand 1, 3, 0
+; BE-NEXT: vperm 4, 4, 9, 10
+; BE-NEXT: vmuloub 9, 2, 5
+; BE-NEXT: vmuleub 5, 2, 5
+; BE-NEXT: vand 7, 3, 6
+; BE-NEXT: vaddubm 6, 6, 6
+; BE-NEXT: vperm 5, 5, 9, 10
+; BE-NEXT: vmuloub 9, 2, 1
+; BE-NEXT: vmuleub 1, 2, 1
+; BE-NEXT: vperm 1, 1, 9, 10
+; BE-NEXT: vmuloub 9, 2, 7
+; BE-NEXT: vmuleub 7, 2, 7
+; BE-NEXT: vand 6, 3, 6
+; BE-NEXT: vperm 7, 7, 9, 10
+; BE-NEXT: vmuloub 9, 2, 6
+; BE-NEXT: vmuleub 6, 2, 6
+; BE-NEXT: vperm 6, 6, 9, 10
+; BE-NEXT: lvx 9, 0, 3
+; BE-NEXT: vslb 0, 0, 0
+; BE-NEXT: vslb 8, 8, 8
+; BE-NEXT: vand 0, 3, 0
+; BE-NEXT: vand 8, 3, 8
+; BE-NEXT: vand 3, 3, 9
+; BE-NEXT: vmuloub 9, 2, 0
+; BE-NEXT: vmuleub 0, 2, 0
+; BE-NEXT: vxor 4, 5, 4
+; BE-NEXT: vperm 0, 0, 9, 10
+; BE-NEXT: vmuloub 9, 2, 8
+; BE-NEXT: vmuleub 8, 2, 8
+; BE-NEXT: vmuloub 5, 2, 3
+; BE-NEXT: vmuleub 2, 2, 3
+; BE-NEXT: vxor 3, 4, 1
+; BE-NEXT: vxor 3, 3, 7
+; BE-NEXT: vperm 2, 2, 5, 10
+; BE-NEXT: vxor 3, 3, 6
+; BE-NEXT: vxor 2, 3, 2
+; BE-NEXT: vperm 8, 8, 9, 10
+; BE-NEXT: vxor 2, 2, 0
+; BE-NEXT: vxor 2, 2, 8
+; BE-NEXT: blr
+;
+; LE-LABEL: clmul_v16i8:
+; LE: # %bb.0:
+; LE-NEXT: vspltisb 4, 2
+; LE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; LE-NEXT: vspltisb 5, 1
+; LE-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; LE-NEXT: xxland 36, 35, 36
+; LE-NEXT: xxland 37, 35, 37
+; LE-NEXT: vspltisb 0, 4
+; LE-NEXT: vspltisb 1, 8
+; LE-NEXT: lxvd2x 0, 0, 3
+; LE-NEXT: vmuloub 7, 2, 4
+; LE-NEXT: vmuleub 4, 2, 4
+; LE-NEXT: addis 3, 2, .LCPI0_1@toc@ha
+; LE-NEXT: addi 3, 3, .LCPI0_1@toc@l
+; LE-NEXT: xxswapd 38, 0
+; LE-NEXT: lxvd2x 0, 0, 3
+; LE-NEXT: vperm 4, 4, 7, 6
+; LE-NEXT: vmuloub 7, 2, 5
+; LE-NEXT: vmuleub 5, 2, 5
+; LE-NEXT: vperm 5, 5, 7, 6
+; LE-NEXT: xxland 39, 35, 32
+; LE-NEXT: vslb 0, 0, 0
+; LE-NEXT: vmuloub 8, 2, 7
+; LE-NEXT: vmuleub 7, 2, 7
+; LE-NEXT: xxland 32, 35, 32
+; LE-NEXT: vperm 7, 7, 8, 6
+; LE-NEXT: xxland 40, 35, 33
+; LE-NEXT: vaddubm 1, 1, 1
+; LE-NEXT: vmuloub 9, 2, 8
+; LE-NEXT: vmuleub 8, 2, 8
+; LE-NEXT: xxland 33, 35, 33
+; LE-NEXT: vperm 8, 8, 9, 6
+; LE-NEXT: vmuloub 9, 2, 1
+; LE-NEXT: vmuleub 1, 2, 1
+; LE-NEXT: vperm 1, 1, 9, 6
+; LE-NEXT: xxland 41, 35, 0
+; LE-NEXT: xxlxor 0, 37, 36
+; LE-NEXT: vmuloub 10, 2, 9
+; LE-NEXT: vmuleub 9, 2, 9
+; LE-NEXT: xxlxor 0, 0, 39
+; LE-NEXT: xxlxor 0, 0, 40
+; LE-NEXT: xxlxor 0, 0, 33
+; LE-NEXT: vperm 9, 9, 10, 6
+; LE-NEXT: vmuloub 10, 2, 0
+; LE-NEXT: vmuleub 0, 2, 0
+; LE-NEXT: xxlxor 0, 0, 41
+; LE-NEXT: vperm 0, 0, 10, 6
+; LE-NEXT: xxleqv 42, 42, 42
+; LE-NEXT: vslb 10, 10, 10
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: xxland 35, 35, 42
+; LE-NEXT: vmuloub 10, 2, 3
+; LE-NEXT: vmuleub 2, 2, 3
+; LE-NEXT: vperm 2, 2, 10, 6
+; LE-NEXT: xxlxor 34, 0, 34
+; LE-NEXT: blr
+ %res = call <16 x i8> @llvm.clmul.v16i8(<16 x i8> %a, <16 x i8> %b)
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; BE-LABEL: clmul_v8i16:
+; BE: # %bb.0:
+; BE-NEXT: addis 3, 2, .LCPI1_0@toc@ha
+; BE-NEXT: vspltish 6, 2
+; BE-NEXT: addi 3, 3, .LCPI1_0@toc@l
+; BE-NEXT: vand 4, 3, 6
+; BE-NEXT: lvx 13, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI1_1@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI1_1@toc@l
+; BE-NEXT: vspltish 7, 1
+; BE-NEXT: lvx 14, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI1_2@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI1_2@toc@l
+; BE-NEXT: vspltish 8, 4
+; BE-NEXT: lvx 15, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI1_3@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI1_3@toc@l
+; BE-NEXT: vspltish 9, 8
+; BE-NEXT: vand 5, 3, 7
+; BE-NEXT: lvx 16, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI1_4@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI1_4@toc@l
+; BE-NEXT: vspltisb 12, -1
+; BE-NEXT: lvx 17, 0, 3
+; BE-NEXT: vand 0, 3, 8
+; BE-NEXT: vand 1, 3, 9
+; BE-NEXT: vslh 10, 8, 8
+; BE-NEXT: vsldoi 7, 7, 7, 1
+; BE-NEXT: vsldoi 6, 6, 6, 1
+; BE-NEXT: vsldoi 8, 8, 8, 1
+; BE-NEXT: vslh 11, 9, 9
+; BE-NEXT: vadduhm 9, 9, 9
+; BE-NEXT: vslh 12, 12, 12
+; BE-NEXT: vand 9, 3, 9
+; BE-NEXT: vand 10, 3, 10
+; BE-NEXT: vand 7, 3, 7
+; BE-NEXT: vand 6, 3, 6
+; BE-NEXT: vand 8, 3, 8
+; BE-NEXT: vand 11, 3, 11
+; BE-NEXT: vand 12, 3, 12
+; BE-NEXT: vand 13, 3, 13
+; BE-NEXT: vand 14, 3, 14
+; BE-NEXT: vand 15, 3, 15
+; BE-NEXT: vand 16, 3, 16
+; BE-NEXT: vand 3, 3, 17
+; BE-NEXT: vxor 17, 17, 17
+; BE-NEXT: vmladduhm 4, 2, 4, 17
+; BE-NEXT: vmladduhm 5, 2, 5, 17
+; BE-NEXT: vmladduhm 0, 2, 0, 17
+; BE-NEXT: vmladduhm 1, 2, 1, 17
+; BE-NEXT: vmladduhm 9, 2, 9, 17
+; BE-NEXT: vmladduhm 10, 2, 10, 17
+; BE-NEXT: vmladduhm 7, 2, 7, 17
+; BE-NEXT: vmladduhm 6, 2, 6, 17
+; BE-NEXT: vmladduhm 8, 2, 8, 17
+; BE-NEXT: vmladduhm 11, 2, 11, 17
+; BE-NEXT: vmladduhm 12, 2, 12, 17
+; BE-NEXT: vmladduhm 13, 2, 13, 17
+; BE-NEXT: vmladduhm 14, 2, 14, 17
+; BE-NEXT: vmladduhm 15, 2, 15, 17
+; BE-NEXT: vmladduhm 16, 2, 16, 17
+; BE-NEXT: vmladduhm 2, 2, 3, 17
+; BE-NEXT: vxor 3, 5, 4
+; BE-NEXT: vxor 3, 3, 0
+; BE-NEXT: vxor 3, 3, 1
+; BE-NEXT: vxor 3, 3, 9
+; BE-NEXT: vxor 3, 3, 13
+; BE-NEXT: vxor 3, 3, 10
+; BE-NEXT: vxor 3, 3, 14
+; BE-NEXT: vxor 3, 3, 7
+; BE-NEXT: vxor 3, 3, 6
+; BE-NEXT: vxor 3, 3, 8
+; BE-NEXT: vxor 3, 3, 11
+; BE-NEXT: vxor 3, 3, 15
+; BE-NEXT: vxor 3, 3, 16
+; BE-NEXT: vxor 2, 3, 2
+; BE-NEXT: vxor 2, 2, 12
+; BE-NEXT: blr
+;
+; LE-LABEL: clmul_v8i16:
+; LE: # %bb.0:
+; LE-NEXT: vspltish 5, 2
+; LE-NEXT: vspltish 0, 1
+; LE-NEXT: addis 3, 2, .LCPI1_0@toc@ha
+; LE-NEXT: xxland 41, 35, 37
+; LE-NEXT: vspltish 1, 4
+; LE-NEXT: vspltish 4, 8
+; LE-NEXT: addi 3, 3, .LCPI1_0@toc@l
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: vsldoi 6, 0, 0, 1
+; LE-NEXT: xxland 32, 35, 32
+; LE-NEXT: vsldoi 7, 5, 5, 1
+; LE-NEXT: vxor 5, 5, 5
+; LE-NEXT: vmladduhm 9, 2, 9, 5
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: addis 3, 2, .LCPI1_1@toc@ha
+; LE-NEXT: addi 3, 3, .LCPI1_1@toc@l
+; LE-NEXT: vsldoi 8, 1, 1, 1
+; LE-NEXT: xxlxor 0, 32, 41
+; LE-NEXT: xxland 32, 35, 33
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: xxland 32, 35, 36
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: vadduhm 0, 4, 4
+; LE-NEXT: vslh 4, 4, 4
+; LE-NEXT: xxland 32, 35, 32
+; LE-NEXT: xxland 36, 35, 36
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: vmladduhm 4, 2, 4, 5
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: xxland 32, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI1_2@toc@ha
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: addi 3, 3, .LCPI1_2@toc@l
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: vslh 0, 1, 1
+; LE-NEXT: xxland 32, 35, 32
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: xxland 32, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI1_3@toc@ha
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: addi 3, 3, .LCPI1_3@toc@l
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: xxland 32, 35, 38
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: xxland 32, 35, 39
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: xxland 32, 35, 40
+; LE-NEXT: vmladduhm 0, 2, 0, 5
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: xxlxor 0, 0, 36
+; LE-NEXT: xxland 36, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI1_4@toc@ha
+; LE-NEXT: vmladduhm 4, 2, 4, 5
+; LE-NEXT: addi 3, 3, .LCPI1_4@toc@l
+; LE-NEXT: xxlxor 0, 0, 36
+; LE-NEXT: xxland 36, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: vmladduhm 4, 2, 4, 5
+; LE-NEXT: xxlxor 0, 0, 36
+; LE-NEXT: xxland 36, 35, 1
+; LE-NEXT: vmladduhm 4, 2, 4, 5
+; LE-NEXT: xxlxor 0, 0, 36
+; LE-NEXT: xxleqv 36, 36, 36
+; LE-NEXT: vslh 4, 4, 4
+; LE-NEXT: xxland 35, 35, 36
+; LE-NEXT: vmladduhm 2, 2, 3, 5
+; LE-NEXT: xxlxor 34, 0, 34
+; LE-NEXT: blr
+ %res = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @clmul_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; BE-LABEL: clmul_v4i32:
+; BE: # %bb.0:
+; BE-NEXT: stdu 1, -1184(1)
+; BE-NEXT: li 3, 992
+; BE-NEXT: vspltisw 9, 4
+; BE-NEXT: stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vand 4, 3, 9
+; BE-NEXT: vspltisw 6, 8
+; BE-NEXT: stvx 21, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1024
+; BE-NEXT: vspltisw 11, 1
+; BE-NEXT: stvx 22, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vand 1, 3, 11
+; BE-NEXT: vspltisw 8, 2
+; BE-NEXT: stvx 23, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1056
+; BE-NEXT: vspltisb 17, -1
+; BE-NEXT: stvx 24, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1072
+; BE-NEXT: stvx 25, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1088
+; BE-NEXT: vsldoi 15, 11, 11, 1
+; BE-NEXT: stvx 26, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1104
+; BE-NEXT: stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1120
+; BE-NEXT: stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1136
+; BE-NEXT: vslw 18, 6, 6
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1152
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1168
+; BE-NEXT: vsldoi 5, 11, 11, 2
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 960
+; BE-NEXT: vand 4, 3, 6
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 624
+; BE-NEXT: vsldoi 13, 6, 6, 2
+; BE-NEXT: vsldoi 4, 11, 11, 3
+; BE-NEXT: vsldoi 11, 6, 6, 3
+; BE-NEXT: vadduwm 6, 6, 6
+; BE-NEXT: vand 12, 3, 6
+; BE-NEXT: stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 752
+; BE-NEXT: vand 6, 3, 18
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 864
+; BE-NEXT: vsldoi 19, 8, 8, 2
+; BE-NEXT: vand 5, 3, 5
+; BE-NEXT: stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 592
+; BE-NEXT: vsldoi 0, 9, 9, 2
+; BE-NEXT: vand 5, 3, 19
+; BE-NEXT: stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 896
+; BE-NEXT: vslw 10, 9, 9
+; BE-NEXT: vsldoi 31, 9, 9, 1
+; BE-NEXT: vsldoi 9, 9, 9, 3
+; BE-NEXT: vand 0, 3, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: vand 23, 3, 13
+; BE-NEXT: vand 13, 3, 4
+; BE-NEXT: vand 4, 3, 9
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI2_0@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_0@toc@l
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI2_1@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_1@toc@l
+; BE-NEXT: vand 25, 3, 4
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: li 3, 928
+; BE-NEXT: vand 4, 3, 4
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI2_2@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_2@toc@l
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI2_3@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_3@toc@l
+; BE-NEXT: vand 16, 3, 10
+; BE-NEXT: vand 10, 3, 4
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI2_4@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_4@toc@l
+; BE-NEXT: vand 30, 3, 4
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: li 3, 768
+; BE-NEXT: vand 4, 3, 4
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI2_5@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_5@toc@l
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: li 3, 704
+; BE-NEXT: vand 4, 3, 4
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI2_6@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_6@toc@l
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI2_7@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_7@toc@l
+; BE-NEXT: vand 27, 3, 4
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI2_8@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_8@toc@l
+; BE-NEXT: vand 22, 3, 4
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI2_9@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_9@toc@l
+; BE-NEXT: vand 21, 3, 4
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI2_10@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_10@toc@l
+; BE-NEXT: vand 20, 3, 4
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: li 3, 496
+; BE-NEXT: vand 4, 3, 4
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI2_11@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_11@toc@l
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: li 3, 448
+; BE-NEXT: vand 4, 3, 4
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI2_12@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI2_12@toc@l
+; BE-NEXT: lvx 4, 0, 3
+; BE-NEXT: li 3, 368
+; BE-NEXT: vand 7, 3, 8
+; BE-NEXT: vsldoi 14, 8, 8, 1
+; BE-NEXT: vsldoi 8, 8, 8, 3
+; BE-NEXT: vslw 17, 17, 17
+; BE-NEXT: vand 15, 3, 15
+; BE-NEXT: vand 14, 3, 14
+; BE-NEXT: vand 24, 3, 31
+; BE-NEXT: vand 26, 3, 8
+; BE-NEXT: vand 11, 3, 11
+; BE-NEXT: vand 9, 3, 17
+; BE-NEXT: vand 3, 3, 4
+; BE-NEXT: stvx 3, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: vspltisw 3, -16
+; BE-NEXT: li 3, 912
+; BE-NEXT: vmulouh 6, 2, 7
+; BE-NEXT: vrlw 7, 7, 3
+; BE-NEXT: vmulouh 8, 2, 1
+; BE-NEXT: vrlw 1, 1, 3
+; BE-NEXT: vxor 0, 0, 0
+; BE-NEXT: vmsumuhm 7, 2, 7, 0
+; BE-NEXT: vmsumuhm 1, 2, 1, 0
+; BE-NEXT: vslw 7, 7, 3
+; BE-NEXT: vadduwm 6, 6, 7
+; BE-NEXT: vslw 1, 1, 3
+; BE-NEXT: vadduwm 1, 8, 1
+; BE-NEXT: vxor 4, 1, 6
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 816
+; BE-NEXT: vrlw 1, 28, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 960
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 848
+; BE-NEXT: vrlw 1, 29, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 880
+; BE-NEXT: vrlw 1, 12, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 832
+; BE-NEXT: vrlw 1, 16, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 800
+; BE-NEXT: vrlw 1, 15, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 784
+; BE-NEXT: vrlw 1, 14, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 736
+; BE-NEXT: vrlw 1, 24, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 752
+; BE-NEXT: lvx 17, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 720
+; BE-NEXT: vrlw 1, 17, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 864
+; BE-NEXT: vmr 31, 16
+; BE-NEXT: lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 688
+; BE-NEXT: vrlw 1, 16, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 672
+; BE-NEXT: vrlw 1, 5, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 896
+; BE-NEXT: vmr 19, 15
+; BE-NEXT: lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 656
+; BE-NEXT: vrlw 1, 15, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 640
+; BE-NEXT: vrlw 1, 23, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 608
+; BE-NEXT: vrlw 1, 13, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 576
+; BE-NEXT: vrlw 1, 26, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: lvx 12, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 544
+; BE-NEXT: vrlw 1, 12, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 528
+; BE-NEXT: vrlw 1, 11, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 512
+; BE-NEXT: vrlw 1, 9, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 480
+; BE-NEXT: vrlw 1, 25, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 928
+; BE-NEXT: lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 464
+; BE-NEXT: vrlw 1, 7, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 432
+; BE-NEXT: vrlw 1, 10, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 400
+; BE-NEXT: vrlw 1, 30, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 768
+; BE-NEXT: vmr 18, 14
+; BE-NEXT: vmr 14, 23
+; BE-NEXT: vmr 23, 26
+; BE-NEXT: vmr 26, 30
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 352
+; BE-NEXT: vrlw 1, 30, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 704
+; BE-NEXT: vmr 6, 25
+; BE-NEXT: vmr 25, 10
+; BE-NEXT: lvx 10, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 320
+; BE-NEXT: vrlw 1, 10, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 288
+; BE-NEXT: vrlw 1, 27, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 256
+; BE-NEXT: vrlw 1, 22, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 224
+; BE-NEXT: vrlw 1, 21, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 208
+; BE-NEXT: vrlw 1, 20, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 496
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 192
+; BE-NEXT: vrlw 1, 5, 3
+; BE-NEXT: vmsumuhm 4, 2, 1, 0
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 448
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 160
+; BE-NEXT: vrlw 1, 4, 3
+; BE-NEXT: vmsumuhm 1, 2, 1, 0
+; BE-NEXT: stvx 1, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 368
+; BE-NEXT: lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 128
+; BE-NEXT: vrlw 8, 1, 3
+; BE-NEXT: vmsumuhm 0, 2, 8, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 624
+; BE-NEXT: vmulouh 8, 2, 29
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 96
+; BE-NEXT: vmulouh 29, 2, 29
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 176
+; BE-NEXT: vmulouh 31, 2, 31
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 240
+; BE-NEXT: vmulouh 19, 2, 19
+; BE-NEXT: stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 272
+; BE-NEXT: vmulouh 18, 2, 18
+; BE-NEXT: stvx 18, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 304
+; BE-NEXT: vmulouh 18, 2, 24
+; BE-NEXT: stvx 18, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 336
+; BE-NEXT: vmulouh 17, 2, 17
+; BE-NEXT: stvx 17, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 560
+; BE-NEXT: vmulouh 16, 2, 16
+; BE-NEXT: stvx 16, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 592
+; BE-NEXT: lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: vmulouh 16, 2, 16
+; BE-NEXT: stvx 16, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 624
+; BE-NEXT: vmulouh 15, 2, 15
+; BE-NEXT: stvx 15, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 752
+; BE-NEXT: vmulouh 14, 2, 14
+; BE-NEXT: stvx 14, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 864
+; BE-NEXT: vmulouh 13, 2, 13
+; BE-NEXT: stvx 13, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 896
+; BE-NEXT: vmulouh 13, 2, 23
+; BE-NEXT: stvx 13, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: vmulouh 12, 2, 12
+; BE-NEXT: stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 960
+; BE-NEXT: vmulouh 11, 2, 11
+; BE-NEXT: stvx 11, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: vmulouh 9, 2, 9
+; BE-NEXT: stvx 9, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 64
+; BE-NEXT: vmulouh 23, 2, 6
+; BE-NEXT: vmulouh 6, 2, 25
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 80
+; BE-NEXT: vmulouh 6, 2, 26
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 112
+; BE-NEXT: vmulouh 6, 2, 30
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 144
+; BE-NEXT: vmulouh 6, 2, 10
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 384
+; BE-NEXT: vmulouh 6, 2, 27
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 416
+; BE-NEXT: vmulouh 6, 2, 22
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 704
+; BE-NEXT: vmulouh 6, 2, 21
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 768
+; BE-NEXT: vmulouh 6, 2, 20
+; BE-NEXT: stvx 6, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 928
+; BE-NEXT: vmulouh 5, 2, 5
+; BE-NEXT: stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 496
+; BE-NEXT: vmulouh 0, 2, 28
+; BE-NEXT: vmulouh 24, 2, 7
+; BE-NEXT: vmulouh 20, 2, 4
+; BE-NEXT: vmulouh 2, 2, 1
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 816
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 912
+; BE-NEXT: vslw 9, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 848
+; BE-NEXT: vadduwm 4, 0, 9
+; BE-NEXT: vxor 4, 2, 4
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 880
+; BE-NEXT: vslw 9, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 832
+; BE-NEXT: vadduwm 5, 8, 9
+; BE-NEXT: vslw 9, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 800
+; BE-NEXT: vslw 8, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 784
+; BE-NEXT: vslw 10, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 736
+; BE-NEXT: vslw 11, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 720
+; BE-NEXT: vslw 12, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 688
+; BE-NEXT: vslw 13, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 672
+; BE-NEXT: vslw 18, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 656
+; BE-NEXT: vslw 19, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 640
+; BE-NEXT: vslw 31, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 608
+; BE-NEXT: vslw 29, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 576
+; BE-NEXT: vslw 22, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 544
+; BE-NEXT: vslw 27, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 528
+; BE-NEXT: vslw 25, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 512
+; BE-NEXT: vslw 30, 2, 3
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 912
+; BE-NEXT: vslw 2, 2, 3
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 480
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 464
+; BE-NEXT: vxor 6, 4, 5
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 432
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 400
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 352
+; BE-NEXT: vslw 2, 2, 3
+; BE-NEXT: lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 320
+; BE-NEXT: vadduwm 2, 23, 2
+; BE-NEXT: lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 288
+; BE-NEXT: lvx 14, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 256
+; BE-NEXT: vslw 4, 4, 3
+; BE-NEXT: vadduwm 4, 24, 4
+; BE-NEXT: lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 224
+; BE-NEXT: lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 208
+; BE-NEXT: lvx 17, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 192
+; BE-NEXT: vslw 5, 5, 3
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 160
+; BE-NEXT: lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 128
+; BE-NEXT: vslw 0, 0, 3
+; BE-NEXT: lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 96
+; BE-NEXT: vslw 1, 1, 3
+; BE-NEXT: vslw 7, 7, 3
+; BE-NEXT: vslw 14, 14, 3
+; BE-NEXT: vslw 15, 15, 3
+; BE-NEXT: vslw 16, 16, 3
+; BE-NEXT: vslw 17, 17, 3
+; BE-NEXT: vslw 28, 28, 3
+; BE-NEXT: vslw 26, 26, 3
+; BE-NEXT: vslw 3, 21, 3
+; BE-NEXT: lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 176
+; BE-NEXT: vadduwm 9, 21, 9
+; BE-NEXT: vxor 6, 6, 9
+; BE-NEXT: vxor 2, 6, 2
+; BE-NEXT: lvx 6, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 240
+; BE-NEXT: vadduwm 6, 6, 8
+; BE-NEXT: vxor 2, 2, 6
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 272
+; BE-NEXT: vadduwm 4, 4, 10
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 304
+; BE-NEXT: vadduwm 4, 4, 11
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 336
+; BE-NEXT: vadduwm 4, 4, 12
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 64
+; BE-NEXT: vadduwm 4, 4, 13
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 80
+; BE-NEXT: vadduwm 4, 4, 5
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 112
+; BE-NEXT: vadduwm 4, 4, 0
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 144
+; BE-NEXT: vadduwm 4, 4, 1
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 560
+; BE-NEXT: vadduwm 4, 4, 7
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 592
+; BE-NEXT: vadduwm 4, 4, 18
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 624
+; BE-NEXT: vadduwm 4, 4, 19
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 752
+; BE-NEXT: vadduwm 4, 4, 31
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 384
+; BE-NEXT: vadduwm 4, 4, 29
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 416
+; BE-NEXT: vadduwm 4, 4, 14
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 704
+; BE-NEXT: vadduwm 4, 4, 15
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 768
+; BE-NEXT: vadduwm 4, 4, 16
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 864
+; BE-NEXT: vadduwm 4, 4, 17
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 896
+; BE-NEXT: vadduwm 4, 4, 22
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 944
+; BE-NEXT: vadduwm 4, 4, 27
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 960
+; BE-NEXT: vadduwm 4, 4, 25
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 928
+; BE-NEXT: vadduwm 4, 4, 30
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 496
+; BE-NEXT: vadduwm 4, 4, 28
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: vadduwm 4, 20, 26
+; BE-NEXT: vxor 2, 2, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 976
+; BE-NEXT: vadduwm 3, 4, 3
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 912
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1168
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1152
+; BE-NEXT: vadduwm 3, 3, 4
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1136
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1120
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1104
+; BE-NEXT: lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1088
+; BE-NEXT: lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1072
+; BE-NEXT: lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1056
+; BE-NEXT: lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1040
+; BE-NEXT: lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1024
+; BE-NEXT: lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1008
+; BE-NEXT: lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 992
+; BE-NEXT: lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: addi 1, 1, 1184
+; BE-NEXT: blr
+;
+; LE-LABEL: clmul_v4i32:
+; LE: # %bb.0:
+; LE-NEXT: vspltisw 0, 2
+; LE-NEXT: vspltisw 1, 1
+; LE-NEXT: addis 3, 2, .LCPI2_0 at toc@ha
+; LE-NEXT: xxland 45, 35, 32
+; LE-NEXT: xxland 46, 35, 33
+; LE-NEXT: vspltisw 5, 4
+; LE-NEXT: vspltisw 4, 8
+; LE-NEXT: addi 3, 3, .LCPI2_0 at toc@l
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: vmuluwm 13, 2, 13
+; LE-NEXT: vmuluwm 14, 2, 14
+; LE-NEXT: addis 3, 2, .LCPI2_1 at toc@ha
+; LE-NEXT: vsldoi 12, 1, 1, 1
+; LE-NEXT: vsldoi 8, 0, 0, 1
+; LE-NEXT: addi 3, 3, .LCPI2_1 at toc@l
+; LE-NEXT: xxland 44, 35, 44
+; LE-NEXT: vmuluwm 12, 2, 12
+; LE-NEXT: xxland 40, 35, 40
+; LE-NEXT: vmuluwm 8, 2, 8
+; LE-NEXT: vsldoi 10, 5, 5, 1
+; LE-NEXT: vsldoi 6, 1, 1, 2
+; LE-NEXT: xxland 38, 35, 38
+; LE-NEXT: vmuluwm 6, 2, 6
+; LE-NEXT: vsldoi 7, 0, 0, 2
+; LE-NEXT: vsldoi 9, 5, 5, 2
+; LE-NEXT: vsldoi 11, 4, 4, 2
+; LE-NEXT: vsldoi 1, 1, 1, 3
+; LE-NEXT: vsldoi 0, 0, 0, 3
+; LE-NEXT: xxland 33, 35, 33
+; LE-NEXT: vmuluwm 1, 2, 1
+; LE-NEXT: xxland 32, 35, 32
+; LE-NEXT: vmuluwm 0, 2, 0
+; LE-NEXT: xxlxor 0, 46, 45
+; LE-NEXT: xxland 45, 35, 37
+; LE-NEXT: vmuluwm 13, 2, 13
+; LE-NEXT: xxlxor 0, 0, 45
+; LE-NEXT: xxland 45, 35, 36
+; LE-NEXT: vmuluwm 13, 2, 13
+; LE-NEXT: xxlxor 0, 0, 45
+; LE-NEXT: vadduwm 13, 4, 4
+; LE-NEXT: xxland 45, 35, 45
+; LE-NEXT: vmuluwm 13, 2, 13
+; LE-NEXT: xxlxor 0, 0, 45
+; LE-NEXT: xxland 45, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_2 at toc@ha
+; LE-NEXT: vmuluwm 13, 2, 13
+; LE-NEXT: addi 3, 3, .LCPI2_2 at toc@l
+; LE-NEXT: xxlxor 0, 0, 45
+; LE-NEXT: vslw 13, 5, 5
+; LE-NEXT: xxland 45, 35, 45
+; LE-NEXT: vmuluwm 13, 2, 13
+; LE-NEXT: xxlxor 0, 0, 45
+; LE-NEXT: xxland 45, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_3 at toc@ha
+; LE-NEXT: vmuluwm 13, 2, 13
+; LE-NEXT: addi 3, 3, .LCPI2_3 at toc@l
+; LE-NEXT: xxlxor 0, 0, 45
+; LE-NEXT: xxlxor 0, 0, 44
+; LE-NEXT: xxlxor 0, 0, 40
+; LE-NEXT: xxland 40, 35, 42
+; LE-NEXT: vmuluwm 8, 2, 8
+; LE-NEXT: xxlxor 0, 0, 40
+; LE-NEXT: vslw 8, 4, 4
+; LE-NEXT: xxland 40, 35, 40
+; LE-NEXT: vmuluwm 8, 2, 8
+; LE-NEXT: xxlxor 0, 0, 40
+; LE-NEXT: xxland 40, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_4 at toc@ha
+; LE-NEXT: vmuluwm 8, 2, 8
+; LE-NEXT: addi 3, 3, .LCPI2_4 at toc@l
+; LE-NEXT: xxlxor 0, 0, 40
+; LE-NEXT: xxland 40, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_5 at toc@ha
+; LE-NEXT: vmuluwm 8, 2, 8
+; LE-NEXT: addi 3, 3, .LCPI2_5 at toc@l
+; LE-NEXT: xxlxor 0, 0, 40
+; LE-NEXT: xxland 40, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_6 at toc@ha
+; LE-NEXT: vmuluwm 8, 2, 8
+; LE-NEXT: addi 3, 3, .LCPI2_6 at toc@l
+; LE-NEXT: xxlxor 0, 0, 40
+; LE-NEXT: xxland 40, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_7 at toc@ha
+; LE-NEXT: vmuluwm 8, 2, 8
+; LE-NEXT: addi 3, 3, .LCPI2_7 at toc@l
+; LE-NEXT: xxlxor 0, 0, 40
+; LE-NEXT: xxlxor 0, 0, 38
+; LE-NEXT: xxland 38, 35, 39
+; LE-NEXT: vmuluwm 6, 2, 6
+; LE-NEXT: xxlxor 0, 0, 38
+; LE-NEXT: xxland 38, 35, 41
+; LE-NEXT: vmuluwm 6, 2, 6
+; LE-NEXT: xxlxor 0, 0, 38
+; LE-NEXT: xxland 38, 35, 43
+; LE-NEXT: vmuluwm 6, 2, 6
+; LE-NEXT: xxlxor 0, 0, 38
+; LE-NEXT: xxland 38, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_8 at toc@ha
+; LE-NEXT: vmuluwm 6, 2, 6
+; LE-NEXT: addi 3, 3, .LCPI2_8 at toc@l
+; LE-NEXT: xxlxor 0, 0, 38
+; LE-NEXT: xxland 38, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_9 at toc@ha
+; LE-NEXT: vmuluwm 6, 2, 6
+; LE-NEXT: addi 3, 3, .LCPI2_9 at toc@l
+; LE-NEXT: xxlxor 0, 0, 38
+; LE-NEXT: xxland 38, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_10 at toc@ha
+; LE-NEXT: vmuluwm 6, 2, 6
+; LE-NEXT: addi 3, 3, .LCPI2_10 at toc@l
+; LE-NEXT: xxlxor 0, 0, 38
+; LE-NEXT: xxland 38, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_11 at toc@ha
+; LE-NEXT: vmuluwm 6, 2, 6
+; LE-NEXT: addi 3, 3, .LCPI2_11 at toc@l
+; LE-NEXT: xxlxor 0, 0, 38
+; LE-NEXT: xxlxor 0, 0, 33
+; LE-NEXT: xxlxor 0, 0, 32
+; LE-NEXT: vsldoi 5, 5, 5, 3
+; LE-NEXT: xxland 37, 35, 37
+; LE-NEXT: vmuluwm 5, 2, 5
+; LE-NEXT: xxlxor 0, 0, 37
+; LE-NEXT: vsldoi 4, 4, 4, 3
+; LE-NEXT: xxland 36, 35, 36
+; LE-NEXT: vmuluwm 4, 2, 4
+; LE-NEXT: xxlxor 0, 0, 36
+; LE-NEXT: xxland 36, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI2_12 at toc@ha
+; LE-NEXT: vmuluwm 4, 2, 4
+; LE-NEXT: addi 3, 3, .LCPI2_12 at toc@l
+; LE-NEXT: xxlxor 0, 0, 36
+; LE-NEXT: xxland 36, 35, 1
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: vmuluwm 4, 2, 4
+; LE-NEXT: xxlxor 0, 0, 36
+; LE-NEXT: xxland 36, 35, 1
+; LE-NEXT: vmuluwm 4, 2, 4
+; LE-NEXT: xxlxor 0, 0, 36
+; LE-NEXT: xxleqv 36, 36, 36
+; LE-NEXT: vslw 4, 4, 4
+; LE-NEXT: xxland 35, 35, 36
+; LE-NEXT: vmuluwm 2, 2, 3
+; LE-NEXT: xxlxor 34, 0, 34
+; LE-NEXT: blr
+ %res = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @clmul_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; BE-LABEL: clmul_v2i64:
+; BE: # %bb.0:
+; BE-NEXT: stdu 1, -1008(1)
+; BE-NEXT: rlwinm 7, 5, 0, 30, 30
+; BE-NEXT: rlwinm 8, 5, 0, 29, 29
+; BE-NEXT: std 2, 856(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 848(1) # 8-byte Folded Spill
+; BE-NEXT: clrldi 7, 5, 63
+; BE-NEXT: mulld 2, 3, 7
+; BE-NEXT: std 31, 1000(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 15, 872(1) # 8-byte Folded Spill
+; BE-NEXT: std 7, 840(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 28, 28
+; BE-NEXT: rlwinm 8, 5, 0, 27, 27
+; BE-NEXT: std 14, 864(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 824(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 832(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 26, 26
+; BE-NEXT: std 17, 888(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 25, 25
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 808(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 816(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 24, 24
+; BE-NEXT: std 16, 880(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 23, 23
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 792(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 800(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 22, 22
+; BE-NEXT: std 19, 904(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 21, 21
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 776(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 784(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 20, 20
+; BE-NEXT: std 18, 896(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 19, 19
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 760(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 768(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 18, 18
+; BE-NEXT: std 21, 920(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 17, 17
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 744(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 752(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 16, 16
+; BE-NEXT: std 20, 912(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 15, 15
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 728(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 736(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 14, 14
+; BE-NEXT: std 23, 936(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 13, 13
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 712(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 720(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 12, 12
+; BE-NEXT: std 22, 928(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 11, 11
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 696(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 704(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 10, 10
+; BE-NEXT: std 25, 952(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 9, 9
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 680(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 688(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 8, 8
+; BE-NEXT: std 24, 944(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 7, 7
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 664(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 672(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 6, 6
+; BE-NEXT: std 27, 968(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 8, 5, 0, 5, 5
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 648(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 656(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 4, 4
+; BE-NEXT: std 26, 960(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: rldicr 8, 5, 0, 0
+; BE-NEXT: std 7, 640(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 3, 3
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 632(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 2, 2
+; BE-NEXT: std 29, 984(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 624(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 1, 1
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 616(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 7, 5, 0, 0, 0
+; BE-NEXT: std 28, 976(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 608(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 32, 32
+; BE-NEXT: rldicl 7, 7, 32, 31
+; BE-NEXT: std 30, 992(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: mulld 8, 3, 8
+; BE-NEXT: std 7, 592(1) # 8-byte Folded Spill
+; BE-NEXT: std 8, 600(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 31, 33
+; BE-NEXT: rldicl 7, 7, 33, 30
+; BE-NEXT: rldicl 8, 5, 30, 34
+; BE-NEXT: rldicl 8, 8, 34, 29
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 576(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 584(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 29, 35
+; BE-NEXT: rldicl 7, 7, 35, 28
+; BE-NEXT: rldicl 8, 5, 28, 36
+; BE-NEXT: rldicl 8, 8, 36, 27
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 560(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 568(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 27, 37
+; BE-NEXT: rldicl 7, 7, 37, 26
+; BE-NEXT: rldicl 8, 5, 26, 38
+; BE-NEXT: rldicl 8, 8, 38, 25
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 544(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 552(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 25, 39
+; BE-NEXT: rldicl 7, 7, 39, 24
+; BE-NEXT: rldicl 8, 5, 24, 40
+; BE-NEXT: rldicl 8, 8, 40, 23
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 528(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 536(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 23, 41
+; BE-NEXT: rldicl 7, 7, 41, 22
+; BE-NEXT: rldicl 8, 5, 22, 42
+; BE-NEXT: rldicl 8, 8, 42, 21
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 512(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 520(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 21, 43
+; BE-NEXT: rldicl 7, 7, 43, 20
+; BE-NEXT: rldicl 8, 5, 20, 44
+; BE-NEXT: rldicl 8, 8, 44, 19
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 496(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 504(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 19, 45
+; BE-NEXT: rldicl 7, 7, 45, 18
+; BE-NEXT: rldicl 8, 5, 18, 46
+; BE-NEXT: rldicl 8, 8, 46, 17
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 480(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 488(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 17, 47
+; BE-NEXT: rldicl 7, 7, 47, 16
+; BE-NEXT: rldicl 8, 5, 16, 48
+; BE-NEXT: rldicl 8, 8, 48, 15
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 464(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 472(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 15, 49
+; BE-NEXT: rldicl 7, 7, 49, 14
+; BE-NEXT: rldicl 8, 5, 14, 50
+; BE-NEXT: rldicl 8, 8, 50, 13
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 448(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 456(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 13, 51
+; BE-NEXT: rldicl 7, 7, 51, 12
+; BE-NEXT: rldicl 8, 5, 12, 52
+; BE-NEXT: rldicl 8, 8, 52, 11
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 432(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 440(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 11, 53
+; BE-NEXT: rldicl 7, 7, 53, 10
+; BE-NEXT: rldicl 8, 5, 10, 54
+; BE-NEXT: rldicl 8, 8, 54, 9
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 416(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 424(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 9, 55
+; BE-NEXT: rldicl 7, 7, 55, 8
+; BE-NEXT: rldicl 8, 5, 8, 56
+; BE-NEXT: rldicl 8, 8, 56, 7
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 400(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 408(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 7, 57
+; BE-NEXT: rldicl 7, 7, 57, 6
+; BE-NEXT: rldicl 8, 5, 6, 58
+; BE-NEXT: rldicl 8, 8, 58, 5
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 384(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 392(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 5, 59
+; BE-NEXT: rldicl 7, 7, 59, 4
+; BE-NEXT: rldicl 8, 5, 4, 60
+; BE-NEXT: rldicl 8, 8, 60, 3
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: std 7, 368(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 7, 3, 8
+; BE-NEXT: std 7, 376(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 7, 5, 3, 61
+; BE-NEXT: rldicl 5, 5, 2, 62
+; BE-NEXT: rldicl 7, 7, 61, 2
+; BE-NEXT: rldicl 5, 5, 62, 1
+; BE-NEXT: mulld 7, 3, 7
+; BE-NEXT: mulld 3, 3, 5
+; BE-NEXT: std 7, 352(1) # 8-byte Folded Spill
+; BE-NEXT: std 3, 360(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 30, 30
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 344(1) # 8-byte Folded Spill
+; BE-NEXT: clrldi 3, 6, 63
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 336(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 29, 29
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 328(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 28, 28
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 320(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 27, 27
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 312(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 26, 26
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 304(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 25, 25
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 296(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 24, 24
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 288(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 23, 23
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 280(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 22, 22
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 272(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 21, 21
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 264(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 20, 20
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 256(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 19, 19
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 248(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 18, 18
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 240(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 17, 17
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 232(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 16, 16
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 224(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 15, 15
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 216(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 14, 14
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 208(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 13, 13
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 200(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 12, 12
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 192(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 11, 11
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 184(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 10, 10
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 176(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 9, 9
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 168(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 8, 8
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 160(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 7, 7
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 152(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 6, 6
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 144(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 5, 5
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 136(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 4, 4
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 128(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 3, 3
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 120(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 2, 2
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 112(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 1, 1
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 104(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 6, 0, 0, 0
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 96(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 6, 32, 32
+; BE-NEXT: rldicl 3, 3, 32, 31
+; BE-NEXT: rldicr 5, 6, 0, 0
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 3, 80(1) # 8-byte Folded Spill
+; BE-NEXT: std 5, 88(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 6, 31, 33
+; BE-NEXT: rldicl 5, 6, 30, 34
+; BE-NEXT: rldicl 3, 3, 33, 30
+; BE-NEXT: rldicl 5, 5, 34, 29
+; BE-NEXT: mulld 3, 4, 3
+; BE-NEXT: std 3, 64(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 4, 5
+; BE-NEXT: std 3, 72(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 6, 29, 35
+; BE-NEXT: rldicl 5, 6, 28, 36
+; BE-NEXT: rldicl 3, 3, 35, 28
+; BE-NEXT: rldicl 5, 5, 36, 27
+; BE-NEXT: mulld 31, 4, 3
+; BE-NEXT: mulld 3, 4, 5
+; BE-NEXT: std 3, 56(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 6, 27, 37
+; BE-NEXT: rldicl 3, 3, 37, 26
+; BE-NEXT: rldicl 5, 6, 26, 38
+; BE-NEXT: mulld 15, 4, 3
+; BE-NEXT: rldicl 3, 6, 25, 39
+; BE-NEXT: rldicl 5, 5, 38, 25
+; BE-NEXT: rldicl 3, 3, 39, 24
+; BE-NEXT: mulld 14, 4, 5
+; BE-NEXT: rldicl 5, 6, 24, 40
+; BE-NEXT: mulld 17, 4, 3
+; BE-NEXT: rldicl 3, 6, 23, 41
+; BE-NEXT: rldicl 5, 5, 40, 23
+; BE-NEXT: rldicl 3, 3, 41, 22
+; BE-NEXT: mulld 16, 4, 5
+; BE-NEXT: rldicl 5, 6, 22, 42
+; BE-NEXT: mulld 19, 4, 3
+; BE-NEXT: rldicl 3, 6, 21, 43
+; BE-NEXT: rldicl 5, 5, 42, 21
+; BE-NEXT: rldicl 3, 3, 43, 20
+; BE-NEXT: mulld 18, 4, 5
+; BE-NEXT: rldicl 5, 6, 20, 44
+; BE-NEXT: mulld 21, 4, 3
+; BE-NEXT: rldicl 3, 6, 19, 45
+; BE-NEXT: rldicl 5, 5, 44, 19
+; BE-NEXT: rldicl 3, 3, 45, 18
+; BE-NEXT: mulld 20, 4, 5
+; BE-NEXT: rldicl 5, 6, 18, 46
+; BE-NEXT: mulld 23, 4, 3
+; BE-NEXT: rldicl 3, 6, 17, 47
+; BE-NEXT: rldicl 5, 5, 46, 17
+; BE-NEXT: rldicl 3, 3, 47, 16
+; BE-NEXT: mulld 22, 4, 5
+; BE-NEXT: rldicl 5, 6, 16, 48
+; BE-NEXT: mulld 25, 4, 3
+; BE-NEXT: rldicl 3, 6, 15, 49
+; BE-NEXT: rldicl 5, 5, 48, 15
+; BE-NEXT: rldicl 3, 3, 49, 14
+; BE-NEXT: mulld 24, 4, 5
+; BE-NEXT: rldicl 5, 6, 14, 50
+; BE-NEXT: mulld 27, 4, 3
+; BE-NEXT: rldicl 3, 6, 13, 51
+; BE-NEXT: rldicl 5, 5, 50, 13
+; BE-NEXT: rldicl 3, 3, 51, 12
+; BE-NEXT: mulld 26, 4, 5
+; BE-NEXT: rldicl 5, 6, 12, 52
+; BE-NEXT: mulld 29, 4, 3
+; BE-NEXT: rldicl 3, 6, 11, 53
+; BE-NEXT: rldicl 5, 5, 52, 11
+; BE-NEXT: rldicl 3, 3, 53, 10
+; BE-NEXT: mulld 28, 4, 5
+; BE-NEXT: rldicl 5, 6, 10, 54
+; BE-NEXT: mulld 0, 4, 3
+; BE-NEXT: rldicl 3, 6, 9, 55
+; BE-NEXT: rldicl 5, 5, 54, 9
+; BE-NEXT: rldicl 3, 3, 55, 8
+; BE-NEXT: mulld 30, 4, 5
+; BE-NEXT: rldicl 5, 6, 8, 56
+; BE-NEXT: mulld 11, 4, 3
+; BE-NEXT: rldicl 3, 6, 7, 57
+; BE-NEXT: rldicl 5, 5, 56, 7
+; BE-NEXT: rldicl 3, 3, 57, 6
+; BE-NEXT: mulld 12, 4, 5
+; BE-NEXT: rldicl 5, 6, 6, 58
+; BE-NEXT: mulld 9, 4, 3
+; BE-NEXT: rldicl 3, 6, 5, 59
+; BE-NEXT: rldicl 5, 5, 58, 5
+; BE-NEXT: rldicl 3, 3, 59, 4
+; BE-NEXT: mulld 10, 4, 5
+; BE-NEXT: rldicl 5, 6, 4, 60
+; BE-NEXT: mulld 7, 4, 3
+; BE-NEXT: rldicl 3, 6, 3, 61
+; BE-NEXT: rldicl 5, 5, 60, 3
+; BE-NEXT: rldicl 6, 6, 2, 62
+; BE-NEXT: rldicl 3, 3, 61, 2
+; BE-NEXT: mulld 8, 4, 5
+; BE-NEXT: rldicl 5, 6, 62, 1
+; BE-NEXT: mulld 6, 4, 3
+; BE-NEXT: ld 3, 848(1) # 8-byte Folded Reload
+; BE-NEXT: mulld 4, 4, 5
+; BE-NEXT: ld 5, 344(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 2, 3
+; BE-NEXT: ld 2, 336(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 2, 5
+; BE-NEXT: ld 2, 840(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 328(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 824(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 320(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 832(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 312(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 808(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 304(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 816(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 296(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 792(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 288(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 800(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 280(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 776(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 272(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 784(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 264(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 760(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 256(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 768(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 248(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 744(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 240(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 752(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 232(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 728(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 224(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 736(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 216(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 712(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 208(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 720(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 200(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 696(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 192(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 704(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 184(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 680(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 176(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 688(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 168(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 664(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 160(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 672(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 152(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 648(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 144(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 656(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 136(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 640(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 128(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 632(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 120(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 624(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 112(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 616(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 104(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 608(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 96(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 592(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 80(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 576(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 64(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 584(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 72(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 2
+; BE-NEXT: ld 2, 560(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 31
+; BE-NEXT: ld 31, 568(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: xor 3, 3, 31
+; BE-NEXT: ld 31, 56(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 31
+; BE-NEXT: ld 31, 544(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 15
+; BE-NEXT: xor 5, 5, 14
+; BE-NEXT: ld 15, 552(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 31
+; BE-NEXT: xor 5, 5, 17
+; BE-NEXT: xor 5, 5, 16
+; BE-NEXT: xor 3, 3, 15
+; BE-NEXT: ld 15, 528(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 19
+; BE-NEXT: xor 5, 5, 18
+; BE-NEXT: ld 17, 536(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 15
+; BE-NEXT: xor 5, 5, 21
+; BE-NEXT: xor 5, 5, 20
+; BE-NEXT: xor 3, 3, 17
+; BE-NEXT: ld 17, 512(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 23
+; BE-NEXT: xor 5, 5, 22
+; BE-NEXT: ld 19, 520(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 17
+; BE-NEXT: xor 5, 5, 25
+; BE-NEXT: xor 5, 5, 24
+; BE-NEXT: xor 3, 3, 19
+; BE-NEXT: ld 19, 496(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 27
+; BE-NEXT: xor 5, 5, 26
+; BE-NEXT: ld 21, 504(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 19
+; BE-NEXT: xor 5, 5, 29
+; BE-NEXT: xor 5, 5, 28
+; BE-NEXT: xor 3, 3, 21
+; BE-NEXT: ld 21, 480(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 0
+; BE-NEXT: xor 5, 5, 30
+; BE-NEXT: ld 23, 488(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 21
+; BE-NEXT: xor 5, 5, 11
+; BE-NEXT: xor 5, 5, 12
+; BE-NEXT: xor 3, 3, 23
+; BE-NEXT: ld 23, 464(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 9
+; BE-NEXT: xor 5, 5, 10
+; BE-NEXT: ld 25, 472(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 23
+; BE-NEXT: xor 5, 5, 7
+; BE-NEXT: xor 5, 5, 8
+; BE-NEXT: xor 3, 3, 25
+; BE-NEXT: ld 25, 448(1) # 8-byte Folded Reload
+; BE-NEXT: xor 5, 5, 6
+; BE-NEXT: xor 4, 5, 4
+; BE-NEXT: ld 27, 456(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 25
+; BE-NEXT: xor 3, 3, 27
+; BE-NEXT: ld 27, 432(1) # 8-byte Folded Reload
+; BE-NEXT: ld 29, 440(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 27
+; BE-NEXT: xor 3, 3, 29
+; BE-NEXT: ld 29, 416(1) # 8-byte Folded Reload
+; BE-NEXT: ld 0, 424(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 29
+; BE-NEXT: xor 3, 3, 0
+; BE-NEXT: ld 0, 400(1) # 8-byte Folded Reload
+; BE-NEXT: ld 11, 408(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 0
+; BE-NEXT: xor 3, 3, 11
+; BE-NEXT: ld 11, 384(1) # 8-byte Folded Reload
+; BE-NEXT: ld 9, 392(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 11
+; BE-NEXT: xor 3, 3, 9
+; BE-NEXT: ld 9, 368(1) # 8-byte Folded Reload
+; BE-NEXT: ld 7, 376(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 9
+; BE-NEXT: xor 3, 3, 7
+; BE-NEXT: ld 7, 352(1) # 8-byte Folded Reload
+; BE-NEXT: ld 6, 360(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 7
+; BE-NEXT: ld 5, 600(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 6
+; BE-NEXT: xor 3, 3, 5
+; BE-NEXT: ld 5, 88(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 5
+; BE-NEXT: ld 2, 856(1) # 8-byte Folded Reload
+; BE-NEXT: ld 31, 1000(1) # 8-byte Folded Reload
+; BE-NEXT: ld 30, 992(1) # 8-byte Folded Reload
+; BE-NEXT: ld 29, 984(1) # 8-byte Folded Reload
+; BE-NEXT: ld 28, 976(1) # 8-byte Folded Reload
+; BE-NEXT: ld 27, 968(1) # 8-byte Folded Reload
+; BE-NEXT: ld 26, 960(1) # 8-byte Folded Reload
+; BE-NEXT: ld 25, 952(1) # 8-byte Folded Reload
+; BE-NEXT: ld 24, 944(1) # 8-byte Folded Reload
+; BE-NEXT: ld 23, 936(1) # 8-byte Folded Reload
+; BE-NEXT: ld 22, 928(1) # 8-byte Folded Reload
+; BE-NEXT: ld 21, 920(1) # 8-byte Folded Reload
+; BE-NEXT: ld 20, 912(1) # 8-byte Folded Reload
+; BE-NEXT: ld 19, 904(1) # 8-byte Folded Reload
+; BE-NEXT: ld 18, 896(1) # 8-byte Folded Reload
+; BE-NEXT: ld 17, 888(1) # 8-byte Folded Reload
+; BE-NEXT: ld 16, 880(1) # 8-byte Folded Reload
+; BE-NEXT: ld 15, 872(1) # 8-byte Folded Reload
+; BE-NEXT: ld 14, 864(1) # 8-byte Folded Reload
+; BE-NEXT: addi 1, 1, 1008
+; BE-NEXT: blr
+;
+; LE-LABEL: clmul_v2i64:
+; LE: # %bb.0:
+; LE-NEXT: stdu 1, -480(1)
+; LE-NEXT: mfvsrd 4, 35
+; LE-NEXT: mfvsrd 3, 34
+; LE-NEXT: std 16, 352(1) # 8-byte Folded Spill
+; LE-NEXT: std 14, 336(1) # 8-byte Folded Spill
+; LE-NEXT: std 15, 344(1) # 8-byte Folded Spill
+; LE-NEXT: std 17, 360(1) # 8-byte Folded Spill
+; LE-NEXT: xxswapd 0, 35
+; LE-NEXT: xxswapd 1, 34
+; LE-NEXT: std 18, 368(1) # 8-byte Folded Spill
+; LE-NEXT: std 30, 464(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 30, 30
+; LE-NEXT: clrldi 6, 4, 63
+; LE-NEXT: rlwinm 7, 4, 0, 29, 29
+; LE-NEXT: rlwinm 8, 4, 0, 28, 28
+; LE-NEXT: rlwinm 9, 4, 0, 27, 27
+; LE-NEXT: rlwinm 10, 4, 0, 26, 26
+; LE-NEXT: rlwinm 11, 4, 0, 25, 25
+; LE-NEXT: rlwinm 12, 4, 0, 24, 24
+; LE-NEXT: rlwinm 0, 4, 0, 23, 23
+; LE-NEXT: rlwinm 30, 4, 0, 22, 22
+; LE-NEXT: std 19, 376(1) # 8-byte Folded Spill
+; LE-NEXT: std 29, 456(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 29, 4, 0, 21, 21
+; LE-NEXT: std 20, 384(1) # 8-byte Folded Spill
+; LE-NEXT: std 28, 448(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 28, 4, 0, 20, 20
+; LE-NEXT: std 21, 392(1) # 8-byte Folded Spill
+; LE-NEXT: std 22, 400(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: mulld 7, 3, 7
+; LE-NEXT: mulld 8, 3, 8
+; LE-NEXT: std 27, 440(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 27, 4, 0, 19, 19
+; LE-NEXT: std 23, 408(1) # 8-byte Folded Spill
+; LE-NEXT: std 26, 432(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 26, 4, 0, 18, 18
+; LE-NEXT: std 24, 416(1) # 8-byte Folded Spill
+; LE-NEXT: std 25, 424(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 25, 4, 0, 17, 17
+; LE-NEXT: std 31, 472(1) # 8-byte Folded Spill
+; LE-NEXT: std 2, 328(1) # 8-byte Folded Spill
+; LE-NEXT: xor 5, 6, 5
+; LE-NEXT: std 8, 64(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 9
+; LE-NEXT: xor 16, 5, 7
+; LE-NEXT: rlwinm 5, 4, 0, 16, 16
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 8, 80(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 10
+; LE-NEXT: std 5, 256(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 15, 15
+; LE-NEXT: std 8, 96(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 11
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 8, 112(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 12
+; LE-NEXT: std 5, 272(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 14, 14
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 8, 128(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 0
+; LE-NEXT: std 5, 288(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 13, 13
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 8, 144(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 30
+; LE-NEXT: std 5, 304(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 12, 12
+; LE-NEXT: std 8, 160(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 29
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 8, 184(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 28
+; LE-NEXT: std 5, 312(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 11, 11
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 8, 200(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 27
+; LE-NEXT: std 5, 320(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 10, 10
+; LE-NEXT: std 8, 224(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 26
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 8, 240(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 8, 3, 25
+; LE-NEXT: std 5, 296(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 9, 9
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 8, 280(1) # 8-byte Folded Spill
+; LE-NEXT: std 5, 264(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 8, 8
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 248(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 7, 7
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 232(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 6, 6
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 216(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 5, 5
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 208(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 4, 4
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 192(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 3, 3
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 176(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 2, 2
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 168(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 1, 1
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 152(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 5, 4, 0, 0, 0
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 136(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 5, 4, 32, 32
+; LE-NEXT: rldicl 5, 5, 32, 31
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 120(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 5, 4, 31, 33
+; LE-NEXT: rldicl 5, 5, 33, 30
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 104(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 5, 4, 30, 34
+; LE-NEXT: rldicl 5, 5, 34, 29
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 88(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 5, 4, 29, 35
+; LE-NEXT: rldicl 5, 5, 35, 28
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 72(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 5, 4, 28, 36
+; LE-NEXT: rldicl 5, 5, 36, 27
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 56(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 5, 4, 27, 37
+; LE-NEXT: rldicl 5, 5, 37, 26
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 48(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 5, 4, 26, 38
+; LE-NEXT: rldicl 5, 5, 38, 25
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: std 5, 40(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 5, 4, 25, 39
+; LE-NEXT: rldicl 5, 5, 39, 24
+; LE-NEXT: mulld 14, 3, 5
+; LE-NEXT: rldicl 5, 4, 24, 40
+; LE-NEXT: rldicl 5, 5, 40, 23
+; LE-NEXT: mulld 15, 3, 5
+; LE-NEXT: rldicl 5, 4, 23, 41
+; LE-NEXT: rldicl 5, 5, 41, 22
+; LE-NEXT: mulld 17, 3, 5
+; LE-NEXT: rldicl 5, 4, 22, 42
+; LE-NEXT: rldicl 5, 5, 42, 21
+; LE-NEXT: mulld 18, 3, 5
+; LE-NEXT: rldicl 5, 4, 21, 43
+; LE-NEXT: rldicl 5, 5, 43, 20
+; LE-NEXT: mulld 19, 3, 5
+; LE-NEXT: rldicl 5, 4, 20, 44
+; LE-NEXT: rldicl 5, 5, 44, 19
+; LE-NEXT: mulld 20, 3, 5
+; LE-NEXT: rldicl 5, 4, 19, 45
+; LE-NEXT: rldicl 5, 5, 45, 18
+; LE-NEXT: mulld 21, 3, 5
+; LE-NEXT: rldicl 5, 4, 18, 46
+; LE-NEXT: rldicl 5, 5, 46, 17
+; LE-NEXT: mulld 22, 3, 5
+; LE-NEXT: rldicl 5, 4, 17, 47
+; LE-NEXT: rldicl 5, 5, 47, 16
+; LE-NEXT: mulld 23, 3, 5
+; LE-NEXT: rldicl 5, 4, 16, 48
+; LE-NEXT: rldicl 5, 5, 48, 15
+; LE-NEXT: mulld 24, 3, 5
+; LE-NEXT: rldicl 5, 4, 15, 49
+; LE-NEXT: rldicl 5, 5, 49, 14
+; LE-NEXT: mulld 25, 3, 5
+; LE-NEXT: rldicl 5, 4, 14, 50
+; LE-NEXT: rldicl 5, 5, 50, 13
+; LE-NEXT: mulld 26, 3, 5
+; LE-NEXT: rldicl 5, 4, 13, 51
+; LE-NEXT: rldicl 5, 5, 51, 12
+; LE-NEXT: mulld 27, 3, 5
+; LE-NEXT: rldicl 5, 4, 12, 52
+; LE-NEXT: rldicl 5, 5, 52, 11
+; LE-NEXT: mulld 28, 3, 5
+; LE-NEXT: rldicl 5, 4, 11, 53
+; LE-NEXT: rldicl 5, 5, 53, 10
+; LE-NEXT: mulld 29, 3, 5
+; LE-NEXT: rldicl 5, 4, 10, 54
+; LE-NEXT: rldicl 5, 5, 54, 9
+; LE-NEXT: mulld 30, 3, 5
+; LE-NEXT: rldicl 5, 4, 9, 55
+; LE-NEXT: rldicl 5, 5, 55, 8
+; LE-NEXT: mulld 0, 3, 5
+; LE-NEXT: rldicl 5, 4, 8, 56
+; LE-NEXT: rldicl 5, 5, 56, 7
+; LE-NEXT: mulld 12, 3, 5
+; LE-NEXT: rldicl 5, 4, 7, 57
+; LE-NEXT: rldicl 5, 5, 57, 6
+; LE-NEXT: mulld 11, 3, 5
+; LE-NEXT: rldicl 5, 4, 6, 58
+; LE-NEXT: rldicl 5, 5, 58, 5
+; LE-NEXT: mulld 10, 3, 5
+; LE-NEXT: rldicl 5, 4, 5, 59
+; LE-NEXT: rldicl 5, 5, 59, 4
+; LE-NEXT: mulld 9, 3, 5
+; LE-NEXT: rldicl 5, 4, 4, 60
+; LE-NEXT: rldicl 5, 5, 60, 3
+; LE-NEXT: mulld 8, 3, 5
+; LE-NEXT: rldicl 5, 4, 3, 61
+; LE-NEXT: rldicl 5, 5, 61, 2
+; LE-NEXT: mulld 7, 3, 5
+; LE-NEXT: rldicl 5, 4, 2, 62
+; LE-NEXT: rldicr 4, 4, 0, 0
+; LE-NEXT: rldicl 5, 5, 62, 1
+; LE-NEXT: mulld 6, 3, 5
+; LE-NEXT: mulld 5, 3, 4
+; LE-NEXT: mffprd 4, 0
+; LE-NEXT: mffprd 3, 1
+; LE-NEXT: rlwinm 2, 4, 0, 30, 30
+; LE-NEXT: clrldi 31, 4, 63
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: mulld 31, 3, 31
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 64(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 29, 29
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 80(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 28, 28
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 96(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 27, 27
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 112(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 26, 26
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 128(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 25, 25
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 144(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 24, 24
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 160(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 23, 23
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 184(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 22, 22
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 200(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 21, 21
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 224(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 20, 20
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 240(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 19, 19
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 280(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 18, 18
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 256(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 17, 17
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 272(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 16, 16
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 288(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 15, 15
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 304(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 14, 14
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 312(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: rlwinm 2, 4, 0, 13, 13
+; LE-NEXT: mulld 2, 3, 2
+; LE-NEXT: xor 31, 31, 2
+; LE-NEXT: ld 2, 320(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 296(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 264(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 248(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 232(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 216(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 208(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 192(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 176(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 168(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 152(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 136(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 120(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 104(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 88(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 72(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 56(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 48(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 40(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 2
+; LE-NEXT: ld 2, 328(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 14
+; LE-NEXT: ld 14, 336(1) # 8-byte Folded Reload
+; LE-NEXT: xor 16, 16, 15
+; LE-NEXT: ld 15, 344(1) # 8-byte Folded Reload
+; LE-NEXT: xor 17, 16, 17
+; LE-NEXT: ld 16, 352(1) # 8-byte Folded Reload
+; LE-NEXT: xor 18, 17, 18
+; LE-NEXT: ld 17, 360(1) # 8-byte Folded Reload
+; LE-NEXT: xor 19, 18, 19
+; LE-NEXT: ld 18, 368(1) # 8-byte Folded Reload
+; LE-NEXT: xor 20, 19, 20
+; LE-NEXT: ld 19, 376(1) # 8-byte Folded Reload
+; LE-NEXT: xor 21, 20, 21
+; LE-NEXT: ld 20, 384(1) # 8-byte Folded Reload
+; LE-NEXT: xor 22, 21, 22
+; LE-NEXT: ld 21, 392(1) # 8-byte Folded Reload
+; LE-NEXT: xor 23, 22, 23
+; LE-NEXT: ld 22, 400(1) # 8-byte Folded Reload
+; LE-NEXT: xor 24, 23, 24
+; LE-NEXT: ld 23, 408(1) # 8-byte Folded Reload
+; LE-NEXT: xor 25, 24, 25
+; LE-NEXT: ld 24, 416(1) # 8-byte Folded Reload
+; LE-NEXT: xor 26, 25, 26
+; LE-NEXT: ld 25, 424(1) # 8-byte Folded Reload
+; LE-NEXT: xor 27, 26, 27
+; LE-NEXT: ld 26, 432(1) # 8-byte Folded Reload
+; LE-NEXT: xor 28, 27, 28
+; LE-NEXT: ld 27, 440(1) # 8-byte Folded Reload
+; LE-NEXT: xor 29, 28, 29
+; LE-NEXT: ld 28, 448(1) # 8-byte Folded Reload
+; LE-NEXT: xor 30, 29, 30
+; LE-NEXT: ld 29, 456(1) # 8-byte Folded Reload
+; LE-NEXT: xor 0, 30, 0
+; LE-NEXT: ld 30, 464(1) # 8-byte Folded Reload
+; LE-NEXT: xor 12, 0, 12
+; LE-NEXT: xor 11, 12, 11
+; LE-NEXT: xor 10, 11, 10
+; LE-NEXT: xor 9, 10, 9
+; LE-NEXT: xor 8, 9, 8
+; LE-NEXT: xor 7, 8, 7
+; LE-NEXT: xor 6, 7, 6
+; LE-NEXT: xor 5, 6, 5
+; LE-NEXT: rlwinm 6, 4, 0, 11, 11
+; LE-NEXT: mtfprd 0, 5
+; LE-NEXT: rlwinm 5, 4, 0, 12, 12
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: mulld 5, 3, 5
+; LE-NEXT: xor 5, 31, 5
+; LE-NEXT: ld 31, 472(1) # 8-byte Folded Reload
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 10, 10
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 9, 9
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 8, 8
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 7, 7
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 6, 6
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 5, 5
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 4, 4
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 3, 3
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 2, 2
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 1, 1
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rlwinm 6, 4, 0, 0, 0
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 32, 32
+; LE-NEXT: rldicl 6, 6, 32, 31
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 31, 33
+; LE-NEXT: rldicl 6, 6, 33, 30
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 30, 34
+; LE-NEXT: rldicl 6, 6, 34, 29
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 29, 35
+; LE-NEXT: rldicl 6, 6, 35, 28
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 28, 36
+; LE-NEXT: rldicl 6, 6, 36, 27
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 27, 37
+; LE-NEXT: rldicl 6, 6, 37, 26
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 26, 38
+; LE-NEXT: rldicl 6, 6, 38, 25
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 25, 39
+; LE-NEXT: rldicl 6, 6, 39, 24
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 24, 40
+; LE-NEXT: rldicl 6, 6, 40, 23
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 23, 41
+; LE-NEXT: rldicl 6, 6, 41, 22
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 22, 42
+; LE-NEXT: rldicl 6, 6, 42, 21
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 21, 43
+; LE-NEXT: rldicl 6, 6, 43, 20
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 20, 44
+; LE-NEXT: rldicl 6, 6, 44, 19
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 19, 45
+; LE-NEXT: rldicl 6, 6, 45, 18
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 18, 46
+; LE-NEXT: rldicl 6, 6, 46, 17
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 17, 47
+; LE-NEXT: rldicl 6, 6, 47, 16
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 16, 48
+; LE-NEXT: rldicl 6, 6, 48, 15
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 15, 49
+; LE-NEXT: rldicl 6, 6, 49, 14
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 14, 50
+; LE-NEXT: rldicl 6, 6, 50, 13
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 13, 51
+; LE-NEXT: rldicl 6, 6, 51, 12
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 12, 52
+; LE-NEXT: rldicl 6, 6, 52, 11
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 11, 53
+; LE-NEXT: rldicl 6, 6, 53, 10
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 10, 54
+; LE-NEXT: rldicl 6, 6, 54, 9
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 9, 55
+; LE-NEXT: rldicl 6, 6, 55, 8
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 8, 56
+; LE-NEXT: rldicl 6, 6, 56, 7
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 7, 57
+; LE-NEXT: rldicl 6, 6, 57, 6
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 6, 58
+; LE-NEXT: rldicl 6, 6, 58, 5
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 5, 59
+; LE-NEXT: rldicl 6, 6, 59, 4
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 4, 60
+; LE-NEXT: rldicl 6, 6, 60, 3
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 3, 61
+; LE-NEXT: rldicl 6, 6, 61, 2
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: rldicl 6, 4, 2, 62
+; LE-NEXT: rldicr 4, 4, 0, 0
+; LE-NEXT: rldicl 6, 6, 62, 1
+; LE-NEXT: mulld 6, 3, 6
+; LE-NEXT: mulld 3, 3, 4
+; LE-NEXT: xor 5, 5, 6
+; LE-NEXT: xor 3, 5, 3
+; LE-NEXT: mtfprd 1, 3
+; LE-NEXT: xxmrghd 34, 0, 1
+; LE-NEXT: addi 1, 1, 480
+; LE-NEXT: blr
+ %res = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %res
+}
+
+define <16 x i8> @clmulr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; BE-LABEL: clmulr_v16i8:
+; BE: # %bb.0:
+; BE-NEXT: li 3, -48
+; BE-NEXT: vspltisb 4, 4
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -32
+; BE-NEXT: vsrb 1, 3, 4
+; BE-NEXT: vspltisb 5, 15
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -16
+; BE-NEXT: vspltisb 7, -1
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI4_0 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI4_0 at toc@l
+; BE-NEXT: vand 3, 3, 5
+; BE-NEXT: vspltisb 13, 8
+; BE-NEXT: vslb 3, 3, 4
+; BE-NEXT: vsrb 0, 2, 4
+; BE-NEXT: vand 2, 2, 5
+; BE-NEXT: vor 1, 1, 3
+; BE-NEXT: lvx 3, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI4_1 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI4_1 at toc@l
+; BE-NEXT: vslb 2, 2, 4
+; BE-NEXT: vor 0, 0, 2
+; BE-NEXT: vspltisb 2, 2
+; BE-NEXT: vsrb 9, 1, 2
+; BE-NEXT: vand 1, 1, 3
+; BE-NEXT: vand 9, 9, 3
+; BE-NEXT: vslb 1, 1, 2
+; BE-NEXT: vsrb 8, 0, 2
+; BE-NEXT: vand 0, 0, 3
+; BE-NEXT: vor 9, 9, 1
+; BE-NEXT: lvx 1, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI4_3 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI4_3 at toc@l
+; BE-NEXT: lvx 15, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI4_2 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI4_2 at toc@l
+; BE-NEXT: vand 8, 8, 3
+; BE-NEXT: vslb 0, 0, 2
+; BE-NEXT: vor 8, 8, 0
+; BE-NEXT: vspltisb 0, 1
+; BE-NEXT: vsrb 11, 9, 0
+; BE-NEXT: vand 9, 9, 1
+; BE-NEXT: vaddubm 9, 9, 9
+; BE-NEXT: vand 11, 11, 1
+; BE-NEXT: vsrb 10, 8, 0
+; BE-NEXT: vand 8, 8, 1
+; BE-NEXT: vaddubm 8, 8, 8
+; BE-NEXT: vor 9, 11, 9
+; BE-NEXT: vslb 6, 4, 4
+; BE-NEXT: vslb 7, 7, 7
+; BE-NEXT: vand 10, 10, 1
+; BE-NEXT: vand 14, 9, 13
+; BE-NEXT: vaddubm 13, 13, 13
+; BE-NEXT: vor 8, 10, 8
+; BE-NEXT: vand 10, 9, 2
+; BE-NEXT: vand 11, 9, 0
+; BE-NEXT: vand 12, 9, 4
+; BE-NEXT: vand 13, 9, 13
+; BE-NEXT: vand 15, 9, 15
+; BE-NEXT: vand 6, 9, 6
+; BE-NEXT: vand 7, 9, 7
+; BE-NEXT: vmuloub 9, 8, 10
+; BE-NEXT: vmuleub 10, 8, 10
+; BE-NEXT: vmuloub 16, 8, 11
+; BE-NEXT: vmuleub 11, 8, 11
+; BE-NEXT: vmuloub 17, 8, 12
+; BE-NEXT: vmuleub 12, 8, 12
+; BE-NEXT: vmuloub 18, 8, 14
+; BE-NEXT: vmuleub 14, 8, 14
+; BE-NEXT: vmuloub 19, 8, 13
+; BE-NEXT: vmuleub 13, 8, 13
+; BE-NEXT: vmuloub 31, 8, 15
+; BE-NEXT: vmuleub 15, 8, 15
+; BE-NEXT: vmuloub 30, 8, 6
+; BE-NEXT: vmuleub 6, 8, 6
+; BE-NEXT: vmuloub 29, 8, 7
+; BE-NEXT: vmuleub 7, 8, 7
+; BE-NEXT: lvx 8, 0, 3
+; BE-NEXT: li 3, -16
+; BE-NEXT: vperm 9, 10, 9, 8
+; BE-NEXT: vperm 10, 11, 16, 8
+; BE-NEXT: vperm 11, 12, 17, 8
+; BE-NEXT: vperm 12, 14, 18, 8
+; BE-NEXT: vperm 13, 13, 19, 8
+; BE-NEXT: vperm 14, 15, 31, 8
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -32
+; BE-NEXT: vperm 6, 6, 30, 8
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -48
+; BE-NEXT: vperm 7, 7, 29, 8
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: vxor 8, 10, 9
+; BE-NEXT: vxor 8, 8, 11
+; BE-NEXT: vxor 8, 8, 12
+; BE-NEXT: vxor 8, 8, 13
+; BE-NEXT: vxor 8, 8, 14
+; BE-NEXT: vxor 6, 8, 6
+; BE-NEXT: vxor 6, 6, 7
+; BE-NEXT: vand 5, 6, 5
+; BE-NEXT: vsrb 7, 6, 4
+; BE-NEXT: vslb 4, 5, 4
+; BE-NEXT: vor 4, 7, 4
+; BE-NEXT: vand 5, 4, 3
+; BE-NEXT: vsrb 4, 4, 2
+; BE-NEXT: vslb 2, 5, 2
+; BE-NEXT: vand 3, 4, 3
+; BE-NEXT: vor 2, 3, 2
+; BE-NEXT: vsrb 3, 2, 0
+; BE-NEXT: vand 2, 2, 1
+; BE-NEXT: vaddubm 2, 2, 2
+; BE-NEXT: vand 3, 3, 1
+; BE-NEXT: vor 2, 3, 2
+; BE-NEXT: blr
+;
+; LE-LABEL: clmulr_v16i8:
+; LE: # %bb.0:
+; LE-NEXT: addis 3, 2, .LCPI4_0 at toc@ha
+; LE-NEXT: vspltisb 4, 4
+; LE-NEXT: vspltisb 5, 2
+; LE-NEXT: addi 3, 3, .LCPI4_0 at toc@l
+; LE-NEXT: vslb 1, 3, 4
+; LE-NEXT: vsrb 3, 3, 4
+; LE-NEXT: vslb 6, 2, 4
+; LE-NEXT: vsrb 2, 2, 4
+; LE-NEXT: lxvd2x 0, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI4_1 at toc@ha
+; LE-NEXT: xxlor 35, 35, 33
+; LE-NEXT: xxlor 34, 34, 38
+; LE-NEXT: vspltisb 0, 1
+; LE-NEXT: addi 3, 3, .LCPI4_1 at toc@l
+; LE-NEXT: vsrb 1, 3, 5
+; LE-NEXT: vsrb 7, 2, 5
+; LE-NEXT: vspltisb 6, 8
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI4_2 at toc@ha
+; LE-NEXT: xxland 35, 35, 0
+; LE-NEXT: xxland 34, 34, 0
+; LE-NEXT: xxland 2, 33, 0
+; LE-NEXT: xxland 3, 39, 0
+; LE-NEXT: addi 3, 3, .LCPI4_2 at toc@l
+; LE-NEXT: vslb 3, 3, 5
+; LE-NEXT: vslb 2, 2, 5
+; LE-NEXT: xxlor 35, 2, 35
+; LE-NEXT: xxlor 34, 3, 34
+; LE-NEXT: lxvd2x 3, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI4_3 at toc@ha
+; LE-NEXT: vsrb 1, 3, 0
+; LE-NEXT: xxland 35, 35, 1
+; LE-NEXT: vsrb 7, 2, 0
+; LE-NEXT: xxland 34, 34, 1
+; LE-NEXT: addi 3, 3, .LCPI4_3 at toc@l
+; LE-NEXT: xxland 2, 33, 1
+; LE-NEXT: vaddubm 3, 3, 3
+; LE-NEXT: vaddubm 2, 2, 2
+; LE-NEXT: xxlor 2, 2, 35
+; LE-NEXT: xxland 35, 2, 37
+; LE-NEXT: xxswapd 33, 3
+; LE-NEXT: xxland 3, 39, 1
+; LE-NEXT: xxlor 34, 3, 34
+; LE-NEXT: lxvd2x 3, 0, 3
+; LE-NEXT: vmuloub 7, 2, 3
+; LE-NEXT: vmuleub 3, 2, 3
+; LE-NEXT: vperm 3, 3, 7, 1
+; LE-NEXT: xxland 39, 2, 32
+; LE-NEXT: vmuloub 8, 2, 7
+; LE-NEXT: vmuleub 7, 2, 7
+; LE-NEXT: vperm 7, 7, 8, 1
+; LE-NEXT: xxland 40, 2, 36
+; LE-NEXT: vmuloub 9, 2, 8
+; LE-NEXT: vmuleub 8, 2, 8
+; LE-NEXT: vperm 8, 8, 9, 1
+; LE-NEXT: xxland 41, 2, 38
+; LE-NEXT: vaddubm 6, 6, 6
+; LE-NEXT: vmuloub 10, 2, 9
+; LE-NEXT: vmuleub 9, 2, 9
+; LE-NEXT: xxland 38, 2, 38
+; LE-NEXT: vperm 9, 9, 10, 1
+; LE-NEXT: vmuloub 10, 2, 6
+; LE-NEXT: vmuleub 6, 2, 6
+; LE-NEXT: vperm 6, 6, 10, 1
+; LE-NEXT: xxland 42, 2, 3
+; LE-NEXT: vmuloub 11, 2, 10
+; LE-NEXT: vmuleub 10, 2, 10
+; LE-NEXT: vperm 10, 10, 11, 1
+; LE-NEXT: vslb 11, 4, 4
+; LE-NEXT: xxland 43, 2, 43
+; LE-NEXT: vmuloub 12, 2, 11
+; LE-NEXT: vmuleub 11, 2, 11
+; LE-NEXT: vperm 11, 11, 12, 1
+; LE-NEXT: xxleqv 44, 44, 44
+; LE-NEXT: vslb 12, 12, 12
+; LE-NEXT: xxland 44, 2, 44
+; LE-NEXT: xxlxor 2, 39, 35
+; LE-NEXT: xxlxor 2, 2, 40
+; LE-NEXT: vmuloub 13, 2, 12
+; LE-NEXT: vmuleub 2, 2, 12
+; LE-NEXT: xxlxor 2, 2, 41
+; LE-NEXT: xxlxor 2, 2, 38
+; LE-NEXT: xxlxor 2, 2, 42
+; LE-NEXT: xxlxor 2, 2, 43
+; LE-NEXT: vperm 2, 2, 13, 1
+; LE-NEXT: xxlxor 34, 2, 34
+; LE-NEXT: vslb 3, 2, 4
+; LE-NEXT: vsrb 2, 2, 4
+; LE-NEXT: xxlor 34, 34, 35
+; LE-NEXT: xxland 35, 34, 0
+; LE-NEXT: vsrb 2, 2, 5
+; LE-NEXT: vslb 3, 3, 5
+; LE-NEXT: xxland 0, 34, 0
+; LE-NEXT: xxlor 34, 0, 35
+; LE-NEXT: vsrb 3, 2, 0
+; LE-NEXT: xxland 34, 34, 1
+; LE-NEXT: xxland 0, 35, 1
+; LE-NEXT: vaddubm 2, 2, 2
+; LE-NEXT: xxlor 34, 0, 34
+; LE-NEXT: blr
+ %a.ext = zext <16 x i8> %a to <16 x i16>
+ %b.ext = zext <16 x i8> %b to <16 x i16>
+ %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+ %res.ext = lshr <16 x i16> %clmul, splat (i16 7)
+ %res = trunc <16 x i16> %res.ext to <16 x i8>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; BE-LABEL: clmulr_v8i16:
+; BE: # %bb.0:
+; BE-NEXT: li 3, -80
+; BE-NEXT: vspltish 4, 8
+; BE-NEXT: vxor 5, 5, 5
+; BE-NEXT: stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -64
+; BE-NEXT: vadduhm 19, 4, 4
+; BE-NEXT: vspltisb 1, -1
+; BE-NEXT: stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -48
+; BE-NEXT: vspltish 0, 2
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -32
+; BE-NEXT: vrlh 8, 2, 4
+; BE-NEXT: vspltish 2, 4
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -16
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI5_0@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI5_0@toc@l
+; BE-NEXT: vrlh 6, 3, 4
+; BE-NEXT: vspltish 3, 1
+; BE-NEXT: vslh 13, 1, 1
+; BE-NEXT: vspltisb 1, 15
+; BE-NEXT: vand 14, 8, 1
+; BE-NEXT: vsrh 8, 8, 2
+; BE-NEXT: vand 15, 6, 1
+; BE-NEXT: vsrh 6, 6, 2
+; BE-NEXT: vslh 14, 14, 2
+; BE-NEXT: vand 8, 8, 1
+; BE-NEXT: vslh 15, 15, 2
+; BE-NEXT: vand 6, 6, 1
+; BE-NEXT: vor 8, 8, 14
+; BE-NEXT: vor 14, 6, 15
+; BE-NEXT: lvx 6, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI5_1@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI5_1@toc@l
+; BE-NEXT: vand 15, 8, 6
+; BE-NEXT: vsrh 8, 8, 0
+; BE-NEXT: vslh 15, 15, 0
+; BE-NEXT: vand 8, 8, 6
+; BE-NEXT: vor 15, 8, 15
+; BE-NEXT: lvx 8, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI5_2@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI5_2@toc@l
+; BE-NEXT: lvx 31, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI5_3@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI5_3@toc@l
+; BE-NEXT: lvx 30, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI5_4@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI5_4@toc@l
+; BE-NEXT: vand 16, 14, 6
+; BE-NEXT: lvx 29, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI5_5@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI5_5@toc@l
+; BE-NEXT: lvx 28, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI5_6@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI5_6@toc@l
+; BE-NEXT: lvx 27, 0, 3
+; BE-NEXT: li 3, -16
+; BE-NEXT: vsrh 14, 14, 0
+; BE-NEXT: vslh 16, 16, 0
+; BE-NEXT: vand 14, 14, 6
+; BE-NEXT: vor 14, 14, 16
+; BE-NEXT: vsrh 17, 14, 3
+; BE-NEXT: vand 14, 14, 8
+; BE-NEXT: vadduhm 14, 14, 14
+; BE-NEXT: vsrh 16, 15, 3
+; BE-NEXT: vand 15, 15, 8
+; BE-NEXT: vadduhm 15, 15, 15
+; BE-NEXT: vand 17, 17, 8
+; BE-NEXT: vand 16, 16, 8
+; BE-NEXT: vor 14, 17, 14
+; BE-NEXT: vslh 7, 2, 2
+; BE-NEXT: vsldoi 9, 3, 3, 1
+; BE-NEXT: vsldoi 10, 0, 0, 1
+; BE-NEXT: vsldoi 11, 2, 2, 1
+; BE-NEXT: vslh 12, 4, 4
+; BE-NEXT: vor 15, 16, 15
+; BE-NEXT: vand 16, 14, 0
+; BE-NEXT: vand 17, 14, 3
+; BE-NEXT: vand 18, 14, 2
+; BE-NEXT: vand 19, 14, 19
+; BE-NEXT: vand 31, 14, 31
+; BE-NEXT: vand 7, 14, 7
+; BE-NEXT: vand 30, 14, 30
+; BE-NEXT: vand 9, 14, 9
+; BE-NEXT: vand 10, 14, 10
+; BE-NEXT: vand 11, 14, 11
+; BE-NEXT: vand 12, 14, 12
+; BE-NEXT: vand 29, 14, 29
+; BE-NEXT: vand 28, 14, 28
+; BE-NEXT: vand 27, 14, 27
+; BE-NEXT: vand 13, 14, 13
+; BE-NEXT: vand 14, 14, 4
+; BE-NEXT: vmladduhm 16, 15, 16, 5
+; BE-NEXT: vmladduhm 17, 15, 17, 5
+; BE-NEXT: vmladduhm 18, 15, 18, 5
+; BE-NEXT: vmladduhm 14, 15, 14, 5
+; BE-NEXT: vmladduhm 19, 15, 19, 5
+; BE-NEXT: vmladduhm 31, 15, 31, 5
+; BE-NEXT: vmladduhm 7, 15, 7, 5
+; BE-NEXT: vmladduhm 30, 15, 30, 5
+; BE-NEXT: vmladduhm 9, 15, 9, 5
+; BE-NEXT: vmladduhm 10, 15, 10, 5
+; BE-NEXT: vmladduhm 11, 15, 11, 5
+; BE-NEXT: vmladduhm 12, 15, 12, 5
+; BE-NEXT: vmladduhm 29, 15, 29, 5
+; BE-NEXT: vmladduhm 28, 15, 28, 5
+; BE-NEXT: vmladduhm 27, 15, 27, 5
+; BE-NEXT: vmladduhm 5, 15, 13, 5
+; BE-NEXT: vxor 13, 17, 16
+; BE-NEXT: vxor 13, 13, 18
+; BE-NEXT: vxor 13, 13, 14
+; BE-NEXT: vxor 13, 13, 19
+; BE-NEXT: vxor 13, 13, 31
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -32
+; BE-NEXT: vxor 7, 13, 7
+; BE-NEXT: vxor 7, 7, 30
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -48
+; BE-NEXT: vxor 7, 7, 9
+; BE-NEXT: vxor 7, 7, 10
+; BE-NEXT: vxor 7, 7, 11
+; BE-NEXT: vxor 7, 7, 12
+; BE-NEXT: vxor 7, 7, 29
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -64
+; BE-NEXT: vxor 7, 7, 28
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -80
+; BE-NEXT: vxor 7, 7, 27
+; BE-NEXT: lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: vxor 5, 7, 5
+; BE-NEXT: vrlh 4, 5, 4
+; BE-NEXT: vand 5, 4, 1
+; BE-NEXT: vsrh 4, 4, 2
+; BE-NEXT: vslh 2, 5, 2
+; BE-NEXT: vand 4, 4, 1
+; BE-NEXT: vor 2, 4, 2
+; BE-NEXT: vand 4, 2, 6
+; BE-NEXT: vsrh 2, 2, 0
+; BE-NEXT: vslh 4, 4, 0
+; BE-NEXT: vand 2, 2, 6
+; BE-NEXT: vor 2, 2, 4
+; BE-NEXT: vsrh 3, 2, 3
+; BE-NEXT: vand 2, 2, 8
+; BE-NEXT: vadduhm 2, 2, 2
+; BE-NEXT: vand 3, 3, 8
+; BE-NEXT: vor 2, 3, 2
+; BE-NEXT: blr
+;
+; LE-LABEL: clmulr_v8i16:
+; LE: # %bb.0:
+; LE-NEXT: vspltish 5, 8
+; LE-NEXT: vspltisb 4, 15
+; LE-NEXT: addis 3, 2, .LCPI5_0@toc@ha
+; LE-NEXT: vrlh 2, 2, 5
+; LE-NEXT: vspltish 0, 4
+; LE-NEXT: addi 3, 3, .LCPI5_0@toc@l
+; LE-NEXT: vspltish 1, 2
+; LE-NEXT: vspltish 6, 1
+; LE-NEXT: vrlh 3, 3, 5
+; LE-NEXT: xxland 42, 34, 36
+; LE-NEXT: vsrh 2, 2, 0
+; LE-NEXT: vslh 10, 10, 0
+; LE-NEXT: xxland 0, 34, 36
+; LE-NEXT: vsldoi 7, 6, 6, 1
+; LE-NEXT: vsldoi 8, 1, 1, 1
+; LE-NEXT: vsldoi 9, 0, 0, 1
+; LE-NEXT: xxlor 34, 0, 42
+; LE-NEXT: lxvd2x 0, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI5_1@toc@ha
+; LE-NEXT: addi 3, 3, .LCPI5_1@toc@l
+; LE-NEXT: xxland 42, 34, 0
+; LE-NEXT: vsrh 2, 2, 1
+; LE-NEXT: vslh 10, 10, 1
+; LE-NEXT: xxland 1, 34, 0
+; LE-NEXT: xxlor 34, 1, 42
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI5_2@toc@ha
+; LE-NEXT: vsrh 10, 2, 6
+; LE-NEXT: addi 3, 3, .LCPI5_2@toc@l
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI5_3@toc@ha
+; LE-NEXT: xxland 34, 34, 1
+; LE-NEXT: xxland 2, 42, 1
+; LE-NEXT: xxland 42, 35, 36
+; LE-NEXT: vsrh 3, 3, 0
+; LE-NEXT: addi 3, 3, .LCPI5_3@toc@l
+; LE-NEXT: vadduhm 2, 2, 2
+; LE-NEXT: vslh 10, 10, 0
+; LE-NEXT: xxlor 34, 2, 34
+; LE-NEXT: xxland 2, 35, 36
+; LE-NEXT: xxlor 35, 2, 42
+; LE-NEXT: xxland 42, 35, 0
+; LE-NEXT: vsrh 3, 3, 1
+; LE-NEXT: vslh 10, 10, 1
+; LE-NEXT: xxland 2, 35, 0
+; LE-NEXT: xxlor 35, 2, 42
+; LE-NEXT: vsrh 10, 3, 6
+; LE-NEXT: xxland 35, 35, 1
+; LE-NEXT: xxland 2, 42, 1
+; LE-NEXT: vadduhm 3, 3, 3
+; LE-NEXT: xxlor 2, 2, 35
+; LE-NEXT: vxor 3, 3, 3
+; LE-NEXT: xxland 42, 2, 33
+; LE-NEXT: xxland 43, 2, 38
+; LE-NEXT: xxland 39, 2, 39
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: vmladduhm 11, 2, 11, 3
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 43, 42
+; LE-NEXT: xxland 42, 2, 32
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: xxland 42, 2, 37
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: vadduhm 10, 5, 5
+; LE-NEXT: xxland 42, 2, 42
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: xxland 42, 2, 4
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI5_4@toc@ha
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: addi 3, 3, .LCPI5_4@toc@l
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: vslh 10, 0, 0
+; LE-NEXT: xxland 42, 2, 42
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: xxland 42, 2, 4
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI5_5@toc@ha
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: addi 3, 3, .LCPI5_5@toc@l
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 40
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 41
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: vslh 7, 5, 5
+; LE-NEXT: xxland 39, 2, 39
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 4
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI5_6@toc@ha
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: addi 3, 3, .LCPI5_6@toc@l
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 4
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 4
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxleqv 39, 39, 39
+; LE-NEXT: vslh 7, 7, 7
+; LE-NEXT: xxland 39, 2, 39
+; LE-NEXT: vmladduhm 2, 2, 7, 3
+; LE-NEXT: xxlxor 34, 3, 34
+; LE-NEXT: vrlh 2, 2, 5
+; LE-NEXT: xxland 35, 34, 36
+; LE-NEXT: vsrh 2, 2, 0
+; LE-NEXT: vslh 3, 3, 0
+; LE-NEXT: xxland 2, 34, 36
+; LE-NEXT: xxlor 34, 2, 35
+; LE-NEXT: xxland 35, 34, 0
+; LE-NEXT: vsrh 2, 2, 1
+; LE-NEXT: vslh 3, 3, 1
+; LE-NEXT: xxland 0, 34, 0
+; LE-NEXT: xxlor 34, 0, 35
+; LE-NEXT: vsrh 3, 2, 6
+; LE-NEXT: xxland 34, 34, 1
+; LE-NEXT: xxland 0, 35, 1
+; LE-NEXT: vadduhm 2, 2, 2
+; LE-NEXT: xxlor 34, 0, 34
+; LE-NEXT: blr
+ %a.ext = zext <8 x i16> %a to <8 x i32>
+ %b.ext = zext <8 x i16> %b to <8 x i32>
+ %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext)
+ %res.ext = lshr <8 x i32> %clmul, splat (i32 15)
+ %res = trunc <8 x i32> %res.ext to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; BE-LABEL: clmulr_v4i32:
+; BE: # %bb.0:
+; BE-NEXT: stdu 1, -1472(1)
+; BE-NEXT: li 3, 1280
+; BE-NEXT: vspltisb 12, -1
+; BE-NEXT: stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1296
+; BE-NEXT: vslw 15, 12, 12
+; BE-NEXT: vspltisw 12, 12
+; BE-NEXT: stvx 21, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1312
+; BE-NEXT: vadduwm 17, 12, 12
+; BE-NEXT: vspltisw 18, 8
+; BE-NEXT: stvx 22, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1328
+; BE-NEXT: vsrw 6, 2, 18
+; BE-NEXT: vspltisw 19, 4
+; BE-NEXT: stvx 23, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1344
+; BE-NEXT: stvx 24, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1360
+; BE-NEXT: stvx 25, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1376
+; BE-NEXT: vsrw 9, 3, 18
+; BE-NEXT: stvx 26, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1392
+; BE-NEXT: stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1408
+; BE-NEXT: stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1424
+; BE-NEXT: vsrw 12, 2, 17
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1440
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1456
+; BE-NEXT: vspltisw 30, 2
+; BE-NEXT: vslw 14, 2, 17
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1264
+; BE-NEXT: vspltisw 31, 1
+; BE-NEXT: stvx 17, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI6_0 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_0 at toc@l
+; BE-NEXT: lvx 29, 0, 3
+; BE-NEXT: li 3, 1248
+; BE-NEXT: vsrw 16, 3, 17
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1232
+; BE-NEXT: vslw 17, 3, 17
+; BE-NEXT: vand 2, 2, 29
+; BE-NEXT: vand 3, 3, 29
+; BE-NEXT: vand 6, 6, 29
+; BE-NEXT: vand 9, 9, 29
+; BE-NEXT: vslw 2, 2, 18
+; BE-NEXT: vslw 3, 3, 18
+; BE-NEXT: vor 6, 6, 12
+; BE-NEXT: vspltisb 12, 15
+; BE-NEXT: stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI6_1 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_1 at toc@l
+; BE-NEXT: vor 9, 9, 16
+; BE-NEXT: vor 2, 14, 2
+; BE-NEXT: vor 3, 17, 3
+; BE-NEXT: vor 2, 2, 6
+; BE-NEXT: vor 3, 3, 9
+; BE-NEXT: vand 6, 2, 12
+; BE-NEXT: vsrw 2, 2, 19
+; BE-NEXT: vand 9, 3, 12
+; BE-NEXT: vsrw 3, 3, 19
+; BE-NEXT: vand 2, 2, 12
+; BE-NEXT: vand 3, 3, 12
+; BE-NEXT: lvx 12, 0, 3
+; BE-NEXT: li 3, 1216
+; BE-NEXT: stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI6_2 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_2 at toc@l
+; BE-NEXT: vslw 6, 6, 19
+; BE-NEXT: vslw 9, 9, 19
+; BE-NEXT: vor 2, 2, 6
+; BE-NEXT: vor 3, 3, 9
+; BE-NEXT: vand 6, 2, 12
+; BE-NEXT: vsrw 2, 2, 30
+; BE-NEXT: vand 9, 3, 12
+; BE-NEXT: vsrw 3, 3, 30
+; BE-NEXT: vand 2, 2, 12
+; BE-NEXT: vand 3, 3, 12
+; BE-NEXT: lvx 12, 0, 3
+; BE-NEXT: li 3, 1200
+; BE-NEXT: stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1136
+; BE-NEXT: stvx 18, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI6_3 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_3 at toc@l
+; BE-NEXT: vslw 6, 6, 30
+; BE-NEXT: vslw 9, 9, 30
+; BE-NEXT: vor 2, 2, 6
+; BE-NEXT: vor 3, 3, 9
+; BE-NEXT: vsrw 6, 2, 31
+; BE-NEXT: vand 2, 2, 12
+; BE-NEXT: vadduwm 2, 2, 2
+; BE-NEXT: vsrw 9, 3, 31
+; BE-NEXT: vand 3, 3, 12
+; BE-NEXT: vand 6, 6, 12
+; BE-NEXT: vand 12, 9, 12
+; BE-NEXT: vor 9, 6, 2
+; BE-NEXT: vadduwm 2, 3, 3
+; BE-NEXT: vor 14, 12, 2
+; BE-NEXT: vadduwm 2, 18, 18
+; BE-NEXT: vand 28, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_4 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_4 at toc@l
+; BE-NEXT: vand 27, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_5 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_5 at toc@l
+; BE-NEXT: vand 25, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_6 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_6 at toc@l
+; BE-NEXT: vslw 4, 19, 19
+; BE-NEXT: vand 26, 14, 4
+; BE-NEXT: vand 4, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_7 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_7 at toc@l
+; BE-NEXT: vsldoi 5, 31, 31, 1
+; BE-NEXT: vand 24, 14, 5
+; BE-NEXT: vand 5, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_8 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_8 at toc@l
+; BE-NEXT: vand 29, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_9 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_9 at toc@l
+; BE-NEXT: vand 21, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_10 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_10 at toc@l
+; BE-NEXT: vslw 7, 18, 18
+; BE-NEXT: vand 3, 14, 7
+; BE-NEXT: vand 7, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_11 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_11 at toc@l
+; BE-NEXT: vsldoi 13, 18, 18, 2
+; BE-NEXT: vand 16, 14, 13
+; BE-NEXT: vand 13, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI6_12 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_12 at toc@l
+; BE-NEXT: vand 12, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: li 3, 1184
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1168
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1152
+; BE-NEXT: vsldoi 11, 31, 31, 2
+; BE-NEXT: stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1072
+; BE-NEXT: vsldoi 1, 19, 19, 1
+; BE-NEXT: vsldoi 10, 30, 30, 2
+; BE-NEXT: vand 20, 14, 11
+; BE-NEXT: vand 11, 14, 2
+; BE-NEXT: vsldoi 2, 31, 31, 3
+; BE-NEXT: vsldoi 8, 19, 19, 2
+; BE-NEXT: vand 22, 14, 1
+; BE-NEXT: vand 1, 14, 10
+; BE-NEXT: vand 10, 14, 2
+; BE-NEXT: vsldoi 2, 30, 30, 3
+; BE-NEXT: vand 17, 14, 8
+; BE-NEXT: vand 8, 14, 2
+; BE-NEXT: vsldoi 2, 19, 19, 3
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vsldoi 2, 18, 18, 3
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI6_13 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_13 at toc@l
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI6_14 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_14 at toc@l
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: li 3, 288
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI6_15 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI6_15 at toc@l
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: li 3, 192
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 272
+; BE-NEXT: vand 2, 14, 15
+; BE-NEXT: vspltisw 15, -16
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: vand 2, 14, 30
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: vand 31, 14, 31
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 912
+; BE-NEXT: vsldoi 0, 30, 30, 1
+; BE-NEXT: vand 19, 14, 19
+; BE-NEXT: stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 880
+; BE-NEXT: vand 23, 14, 0
+; BE-NEXT: vand 14, 14, 18
+; BE-NEXT: stvx 14, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1120
+; BE-NEXT: vxor 6, 6, 6
+; BE-NEXT: vrlw 0, 2, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1104
+; BE-NEXT: vrlw 0, 31, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1088
+; BE-NEXT: vrlw 0, 19, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1056
+; BE-NEXT: vrlw 0, 14, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1024
+; BE-NEXT: vrlw 0, 28, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 992
+; BE-NEXT: vrlw 0, 27, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 960
+; BE-NEXT: vrlw 0, 26, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 928
+; BE-NEXT: vrlw 0, 25, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 896
+; BE-NEXT: vrlw 0, 24, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 864
+; BE-NEXT: vrlw 0, 23, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 832
+; BE-NEXT: vrlw 0, 22, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 800
+; BE-NEXT: vrlw 0, 3, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 768
+; BE-NEXT: vrlw 0, 4, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 736
+; BE-NEXT: vrlw 0, 5, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 704
+; BE-NEXT: vrlw 0, 29, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 672
+; BE-NEXT: vrlw 0, 21, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 640
+; BE-NEXT: vrlw 0, 20, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 592
+; BE-NEXT: vrlw 0, 1, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 560
+; BE-NEXT: vrlw 0, 17, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 528
+; BE-NEXT: vrlw 0, 16, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 496
+; BE-NEXT: vrlw 0, 7, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 464
+; BE-NEXT: vrlw 0, 13, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 432
+; BE-NEXT: vrlw 0, 12, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 400
+; BE-NEXT: vrlw 0, 11, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 368
+; BE-NEXT: vrlw 0, 10, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 336
+; BE-NEXT: vrlw 0, 8, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1072
+; BE-NEXT: vmr 14, 7
+; BE-NEXT: lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 304
+; BE-NEXT: vrlw 0, 7, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vmr 30, 1
+; BE-NEXT: lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 240
+; BE-NEXT: vrlw 0, 1, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vmr 19, 5
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 208
+; BE-NEXT: vrlw 0, 5, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 288
+; BE-NEXT: vmr 18, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 160
+; BE-NEXT: vrlw 0, 4, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 192
+; BE-NEXT: vmr 31, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 128
+; BE-NEXT: vrlw 0, 3, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 272
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 64
+; BE-NEXT: vrlw 0, 2, 15
+; BE-NEXT: vmsumuhm 0, 9, 0, 6
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 96
+; BE-NEXT: vmulouh 0, 9, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 80
+; BE-NEXT: vmulouh 0, 9, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 912
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 112
+; BE-NEXT: vmulouh 0, 9, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 880
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 144
+; BE-NEXT: vmulouh 0, 9, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 176
+; BE-NEXT: vmulouh 0, 9, 28
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 224
+; BE-NEXT: vmulouh 0, 9, 27
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 256
+; BE-NEXT: vmulouh 0, 9, 26
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 320
+; BE-NEXT: vmulouh 0, 9, 25
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 352
+; BE-NEXT: vmulouh 0, 9, 24
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 384
+; BE-NEXT: vmulouh 0, 9, 23
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 416
+; BE-NEXT: vmulouh 0, 9, 22
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 448
+; BE-NEXT: vmulouh 0, 9, 31
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 480
+; BE-NEXT: vmulouh 0, 9, 18
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 512
+; BE-NEXT: vmulouh 0, 9, 19
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 544
+; BE-NEXT: vmulouh 0, 9, 29
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 576
+; BE-NEXT: vmulouh 0, 9, 21
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 608
+; BE-NEXT: vmulouh 0, 9, 20
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 624
+; BE-NEXT: vmulouh 0, 9, 30
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 656
+; BE-NEXT: vmulouh 0, 9, 17
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 688
+; BE-NEXT: vmulouh 0, 9, 16
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 720
+; BE-NEXT: vmulouh 0, 9, 14
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 752
+; BE-NEXT: vmulouh 0, 9, 13
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 784
+; BE-NEXT: vmulouh 0, 9, 12
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 816
+; BE-NEXT: vmulouh 0, 9, 11
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 848
+; BE-NEXT: vmulouh 0, 9, 10
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 880
+; BE-NEXT: vmulouh 0, 9, 8
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 912
+; BE-NEXT: vmulouh 0, 9, 7
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: vmulouh 0, 9, 1
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: vmulouh 5, 9, 5
+; BE-NEXT: stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vmulouh 4, 9, 4
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vmulouh 3, 9, 3
+; BE-NEXT: stvx 3, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1072
+; BE-NEXT: vmulouh 2, 9, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1120
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1104
+; BE-NEXT: vslw 9, 2, 15
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1088
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1056
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1024
+; BE-NEXT: vslw 2, 2, 15
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 992
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 960
+; BE-NEXT: lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 928
+; BE-NEXT: vslw 3, 3, 15
+; BE-NEXT: lvx 6, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 896
+; BE-NEXT: lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 864
+; BE-NEXT: lvx 8, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 832
+; BE-NEXT: vslw 4, 4, 15
+; BE-NEXT: lvx 10, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 800
+; BE-NEXT: lvx 11, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 768
+; BE-NEXT: vslw 5, 5, 15
+; BE-NEXT: lvx 12, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 736
+; BE-NEXT: lvx 13, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 704
+; BE-NEXT: lvx 14, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 672
+; BE-NEXT: vslw 0, 0, 15
+; BE-NEXT: lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 640
+; BE-NEXT: lvx 17, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 592
+; BE-NEXT: lvx 18, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 560
+; BE-NEXT: vslw 1, 1, 15
+; BE-NEXT: lvx 19, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 528
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 496
+; BE-NEXT: vslw 6, 6, 15
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 464
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 432
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 400
+; BE-NEXT: vslw 7, 7, 15
+; BE-NEXT: lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 368
+; BE-NEXT: lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 336
+; BE-NEXT: lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 304
+; BE-NEXT: vslw 8, 8, 15
+; BE-NEXT: lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 240
+; BE-NEXT: lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 208
+; BE-NEXT: vslw 10, 10, 15
+; BE-NEXT: lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 160
+; BE-NEXT: lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 128
+; BE-NEXT: lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1120
+; BE-NEXT: vslw 11, 11, 15
+; BE-NEXT: vslw 20, 20, 15
+; BE-NEXT: stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 64
+; BE-NEXT: lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 96
+; BE-NEXT: vslw 12, 12, 15
+; BE-NEXT: vslw 13, 13, 15
+; BE-NEXT: vslw 14, 14, 15
+; BE-NEXT: vslw 16, 16, 15
+; BE-NEXT: vslw 17, 17, 15
+; BE-NEXT: vslw 18, 18, 15
+; BE-NEXT: vslw 19, 19, 15
+; BE-NEXT: vslw 31, 31, 15
+; BE-NEXT: vslw 30, 30, 15
+; BE-NEXT: vslw 29, 29, 15
+; BE-NEXT: vslw 28, 28, 15
+; BE-NEXT: vslw 27, 27, 15
+; BE-NEXT: vslw 26, 26, 15
+; BE-NEXT: vslw 25, 25, 15
+; BE-NEXT: vslw 24, 24, 15
+; BE-NEXT: vslw 23, 23, 15
+; BE-NEXT: vslw 22, 22, 15
+; BE-NEXT: vslw 21, 21, 15
+; BE-NEXT: vslw 20, 20, 15
+; BE-NEXT: lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 80
+; BE-NEXT: vadduwm 9, 15, 9
+; BE-NEXT: lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 112
+; BE-NEXT: vadduwm 2, 15, 2
+; BE-NEXT: vxor 2, 2, 9
+; BE-NEXT: lvx 9, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 144
+; BE-NEXT: vadduwm 3, 9, 3
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 176
+; BE-NEXT: vadduwm 3, 3, 4
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 224
+; BE-NEXT: vadduwm 3, 3, 5
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 256
+; BE-NEXT: vadduwm 3, 3, 0
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 320
+; BE-NEXT: vadduwm 3, 3, 1
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 352
+; BE-NEXT: vadduwm 3, 3, 6
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 384
+; BE-NEXT: vadduwm 3, 3, 7
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 416
+; BE-NEXT: vadduwm 3, 3, 8
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 448
+; BE-NEXT: vadduwm 3, 3, 10
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 480
+; BE-NEXT: vadduwm 3, 3, 11
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 512
+; BE-NEXT: vadduwm 3, 3, 12
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 544
+; BE-NEXT: vadduwm 3, 3, 13
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 576
+; BE-NEXT: vadduwm 3, 3, 14
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 608
+; BE-NEXT: vadduwm 3, 3, 16
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 624
+; BE-NEXT: vadduwm 3, 3, 17
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 656
+; BE-NEXT: vadduwm 3, 3, 18
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 688
+; BE-NEXT: vadduwm 3, 3, 19
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 720
+; BE-NEXT: vadduwm 3, 3, 31
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 752
+; BE-NEXT: vadduwm 3, 3, 30
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 784
+; BE-NEXT: vadduwm 3, 3, 29
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 816
+; BE-NEXT: vadduwm 3, 3, 28
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 848
+; BE-NEXT: vadduwm 3, 3, 27
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 880
+; BE-NEXT: vadduwm 3, 3, 26
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 912
+; BE-NEXT: vadduwm 3, 3, 25
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 944
+; BE-NEXT: vadduwm 3, 3, 24
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 976
+; BE-NEXT: vadduwm 3, 3, 23
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vadduwm 3, 3, 22
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vadduwm 3, 3, 21
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1120
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1072
+; BE-NEXT: vadduwm 3, 3, 4
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1264
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1136
+; BE-NEXT: vadduwm 3, 3, 20
+; BE-NEXT: lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1248
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1232
+; BE-NEXT: vsrw 3, 2, 5
+; BE-NEXT: vsrw 4, 2, 1
+; BE-NEXT: vslw 5, 2, 5
+; BE-NEXT: vand 2, 2, 0
+; BE-NEXT: vslw 2, 2, 1
+; BE-NEXT: vand 4, 4, 0
+; BE-NEXT: vor 2, 5, 2
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1152
+; BE-NEXT: vor 3, 4, 3
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1216
+; BE-NEXT: vor 2, 2, 3
+; BE-NEXT: vand 3, 2, 5
+; BE-NEXT: vsrw 2, 2, 4
+; BE-NEXT: vand 2, 2, 5
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1168
+; BE-NEXT: vslw 3, 3, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1184
+; BE-NEXT: vor 2, 2, 3
+; BE-NEXT: vand 3, 2, 5
+; BE-NEXT: vsrw 2, 2, 4
+; BE-NEXT: vslw 3, 3, 4
+; BE-NEXT: vand 2, 2, 5
+; BE-NEXT: vor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1200
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1456
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1440
+; BE-NEXT: vsrw 3, 2, 3
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1424
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1408
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1392
+; BE-NEXT: vand 2, 2, 4
+; BE-NEXT: vadduwm 2, 2, 2
+; BE-NEXT: lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1376
+; BE-NEXT: lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1360
+; BE-NEXT: lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1344
+; BE-NEXT: vand 3, 3, 4
+; BE-NEXT: lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1328
+; BE-NEXT: lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1312
+; BE-NEXT: vor 2, 3, 2
+; BE-NEXT: lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1296
+; BE-NEXT: lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1280
+; BE-NEXT: lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: addi 1, 1, 1472
+; BE-NEXT: blr
+;
+; LE-LABEL: clmulr_v4i32:
+; LE: # %bb.0:
+; LE-NEXT: addis 3, 2, .LCPI6_0 at toc@ha
+; LE-NEXT: vspltisw 7, 12
+; LE-NEXT: vspltisw 4, 8
+; LE-NEXT: addi 3, 3, .LCPI6_0 at toc@l
+; LE-NEXT: vadduwm 7, 7, 7
+; LE-NEXT: vsrw 17, 2, 4
+; LE-NEXT: vspltisb 5, 15
+; LE-NEXT: vspltisw 0, 4
+; LE-NEXT: lxvd2x 0, 0, 3
+; LE-NEXT: vsrw 16, 2, 7
+; LE-NEXT: addis 3, 2, .LCPI6_1 at toc@ha
+; LE-NEXT: vspltisw 1, 2
+; LE-NEXT: vspltisw 6, 1
+; LE-NEXT: vsldoi 10, 0, 0, 1
+; LE-NEXT: addi 3, 3, .LCPI6_1 at toc@l
+; LE-NEXT: vsldoi 13, 0, 0, 2
+; LE-NEXT: vsldoi 9, 1, 1, 1
+; LE-NEXT: vsldoi 12, 1, 1, 2
+; LE-NEXT: vsldoi 14, 4, 4, 2
+; LE-NEXT: xxland 1, 49, 0
+; LE-NEXT: vsldoi 8, 6, 6, 1
+; LE-NEXT: vsldoi 11, 6, 6, 2
+; LE-NEXT: vsldoi 15, 6, 6, 3
+; LE-NEXT: xxlor 1, 1, 48
+; LE-NEXT: vslw 16, 2, 7
+; LE-NEXT: xxland 34, 34, 0
+; LE-NEXT: vslw 2, 2, 4
+; LE-NEXT: xxlor 2, 48, 34
+; LE-NEXT: xxlor 34, 2, 1
+; LE-NEXT: xxland 49, 34, 37
+; LE-NEXT: vsrw 2, 2, 0
+; LE-NEXT: vslw 17, 17, 0
+; LE-NEXT: xxland 1, 34, 37
+; LE-NEXT: xxlor 34, 1, 49
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_2 at toc@ha
+; LE-NEXT: addi 3, 3, .LCPI6_2 at toc@l
+; LE-NEXT: xxland 50, 34, 1
+; LE-NEXT: vsrw 2, 2, 1
+; LE-NEXT: vslw 18, 18, 1
+; LE-NEXT: xxland 2, 34, 1
+; LE-NEXT: xxlor 34, 2, 50
+; LE-NEXT: lxvd2x 2, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_3 at toc@ha
+; LE-NEXT: vsrw 19, 2, 6
+; LE-NEXT: addi 3, 3, .LCPI6_3 at toc@l
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_4 at toc@ha
+; LE-NEXT: xxland 34, 34, 2
+; LE-NEXT: xxland 3, 51, 2
+; LE-NEXT: vsrw 19, 3, 4
+; LE-NEXT: addi 3, 3, .LCPI6_4 at toc@l
+; LE-NEXT: vadduwm 2, 2, 2
+; LE-NEXT: xxlor 34, 3, 34
+; LE-NEXT: xxland 3, 51, 0
+; LE-NEXT: vsrw 19, 3, 7
+; LE-NEXT: xxlor 3, 3, 51
+; LE-NEXT: vslw 19, 3, 7
+; LE-NEXT: xxland 35, 35, 0
+; LE-NEXT: vslw 3, 3, 4
+; LE-NEXT: vsldoi 16, 1, 1, 3
+; LE-NEXT: xxlor 4, 51, 35
+; LE-NEXT: xxlor 35, 4, 3
+; LE-NEXT: xxland 51, 35, 37
+; LE-NEXT: vsrw 3, 3, 0
+; LE-NEXT: vslw 19, 19, 0
+; LE-NEXT: xxland 3, 35, 37
+; LE-NEXT: xxlor 35, 3, 51
+; LE-NEXT: xxland 51, 35, 1
+; LE-NEXT: vsrw 3, 3, 1
+; LE-NEXT: vslw 19, 19, 1
+; LE-NEXT: xxland 3, 35, 1
+; LE-NEXT: xxlor 35, 3, 51
+; LE-NEXT: vsrw 19, 3, 6
+; LE-NEXT: xxland 35, 35, 2
+; LE-NEXT: xxland 3, 51, 2
+; LE-NEXT: vadduwm 3, 3, 3
+; LE-NEXT: xxlor 3, 3, 35
+; LE-NEXT: xxland 35, 3, 33
+; LE-NEXT: xxland 51, 3, 38
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: vmuluwm 19, 2, 19
+; LE-NEXT: xxlxor 4, 51, 35
+; LE-NEXT: xxland 35, 3, 32
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 36
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: vadduwm 3, 4, 4
+; LE-NEXT: xxland 35, 3, 35
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: vsldoi 17, 0, 0, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_5 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_5 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: vslw 3, 0, 0
+; LE-NEXT: xxland 35, 3, 35
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_6 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_6 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 40
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 41
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 42
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: vslw 3, 4, 4
+; LE-NEXT: xxland 35, 3, 35
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_7 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_7 at toc@l
+; LE-NEXT: vsldoi 18, 4, 4, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_8 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_8 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_9 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_9 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_10 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_10 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 43
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 44
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 45
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 46
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_11 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_11 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_12 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_12 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_13 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_13 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_14 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_14 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 47
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 48
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 49
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 50
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI6_15 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI6_15 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxleqv 35, 35, 35
+; LE-NEXT: vslw 3, 3, 3
+; LE-NEXT: xxland 35, 3, 35
+; LE-NEXT: vmuluwm 2, 2, 3
+; LE-NEXT: xxlxor 34, 4, 34
+; LE-NEXT: vsrw 8, 2, 4
+; LE-NEXT: vsrw 3, 2, 7
+; LE-NEXT: xxland 3, 40, 0
+; LE-NEXT: xxlor 3, 3, 35
+; LE-NEXT: vslw 3, 2, 7
+; LE-NEXT: xxland 34, 34, 0
+; LE-NEXT: vslw 2, 2, 4
+; LE-NEXT: xxlor 0, 35, 34
+; LE-NEXT: xxlor 34, 0, 3
+; LE-NEXT: xxland 35, 34, 37
+; LE-NEXT: vsrw 2, 2, 0
+; LE-NEXT: vslw 3, 3, 0
+; LE-NEXT: xxland 0, 34, 37
+; LE-NEXT: xxlor 34, 0, 35
+; LE-NEXT: xxland 35, 34, 1
+; LE-NEXT: vsrw 2, 2, 1
+; LE-NEXT: vslw 3, 3, 1
+; LE-NEXT: xxland 0, 34, 1
+; LE-NEXT: xxlor 34, 0, 35
+; LE-NEXT: vsrw 3, 2, 6
+; LE-NEXT: xxland 34, 34, 2
+; LE-NEXT: xxland 0, 35, 2
+; LE-NEXT: vadduwm 2, 2, 2
+; LE-NEXT: xxlor 34, 0, 34
+; LE-NEXT: blr
+ %a.ext = zext <4 x i32> %a to <4 x i64>
+ %b.ext = zext <4 x i32> %b to <4 x i64>
+ %clmul = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %a.ext, <4 x i64> %b.ext)
+ %res.ext = lshr <4 x i64> %clmul, splat (i64 31)
+ %res = trunc <4 x i64> %res.ext to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @clmulr_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; BE-LABEL: clmulr_v2i64:
+; BE: # %bb.0:
+; BE-NEXT: stdu 1, -1056(1)
+; BE-NEXT: lis 7, -21846
+; BE-NEXT: lis 8, 21845
+; BE-NEXT: std 26, 1008(1) # 8-byte Folded Spill
+; BE-NEXT: ori 7, 7, 43690
+; BE-NEXT: ori 8, 8, 21845
+; BE-NEXT: std 27, 1016(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 7, 7, 32
+; BE-NEXT: sldi 8, 8, 32
+; BE-NEXT: lis 9, -13108
+; BE-NEXT: lis 10, 13107
+; BE-NEXT: std 30, 1040(1) # 8-byte Folded Spill
+; BE-NEXT: oris 7, 7, 43690
+; BE-NEXT: oris 8, 8, 21845
+; BE-NEXT: std 28, 1024(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 0, 3, 1
+; BE-NEXT: rldicl 3, 3, 63, 1
+; BE-NEXT: ori 9, 9, 52428
+; BE-NEXT: ori 10, 10, 13107
+; BE-NEXT: std 29, 1032(1) # 8-byte Folded Spill
+; BE-NEXT: ori 27, 7, 43690
+; BE-NEXT: ori 26, 8, 21845
+; BE-NEXT: std 2, 904(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 9, 9, 32
+; BE-NEXT: sldi 10, 10, 32
+; BE-NEXT: and 7, 0, 27
+; BE-NEXT: and 3, 3, 26
+; BE-NEXT: std 31, 1048(1) # 8-byte Folded Spill
+; BE-NEXT: lis 11, -3856
+; BE-NEXT: lis 12, 3855
+; BE-NEXT: std 15, 920(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 30, 5, 1
+; BE-NEXT: rldicl 5, 5, 63, 1
+; BE-NEXT: oris 9, 9, 52428
+; BE-NEXT: oris 10, 10, 13107
+; BE-NEXT: std 14, 912(1) # 8-byte Folded Spill
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: ori 11, 11, 61680
+; BE-NEXT: std 17, 936(1) # 8-byte Folded Spill
+; BE-NEXT: ori 12, 12, 3855
+; BE-NEXT: ori 29, 9, 52428
+; BE-NEXT: ori 28, 10, 13107
+; BE-NEXT: and 8, 30, 27
+; BE-NEXT: std 16, 928(1) # 8-byte Folded Spill
+; BE-NEXT: and 5, 5, 26
+; BE-NEXT: sldi 7, 3, 2
+; BE-NEXT: std 19, 952(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 3, 62, 2
+; BE-NEXT: sldi 11, 11, 32
+; BE-NEXT: sldi 12, 12, 32
+; BE-NEXT: or 5, 5, 8
+; BE-NEXT: std 18, 944(1) # 8-byte Folded Spill
+; BE-NEXT: and 7, 7, 29
+; BE-NEXT: and 3, 3, 28
+; BE-NEXT: std 21, 968(1) # 8-byte Folded Spill
+; BE-NEXT: oris 11, 11, 61680
+; BE-NEXT: oris 12, 12, 3855
+; BE-NEXT: sldi 8, 5, 2
+; BE-NEXT: rldicl 5, 5, 62, 2
+; BE-NEXT: std 20, 960(1) # 8-byte Folded Spill
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: ori 9, 11, 61680
+; BE-NEXT: std 23, 984(1) # 8-byte Folded Spill
+; BE-NEXT: ori 10, 12, 3855
+; BE-NEXT: and 8, 8, 29
+; BE-NEXT: and 5, 5, 28
+; BE-NEXT: sldi 7, 3, 4
+; BE-NEXT: std 22, 976(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 3, 60, 4
+; BE-NEXT: or 5, 5, 8
+; BE-NEXT: std 25, 1000(1) # 8-byte Folded Spill
+; BE-NEXT: and 7, 7, 9
+; BE-NEXT: and 3, 3, 10
+; BE-NEXT: sldi 8, 5, 4
+; BE-NEXT: rldicl 5, 5, 60, 4
+; BE-NEXT: std 24, 992(1) # 8-byte Folded Spill
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: and 8, 8, 9
+; BE-NEXT: std 27, 360(1) # 8-byte Folded Spill
+; BE-NEXT: and 5, 5, 10
+; BE-NEXT: rotlwi 7, 3, 24
+; BE-NEXT: or 5, 5, 8
+; BE-NEXT: rlwimi 7, 3, 8, 8, 15
+; BE-NEXT: std 26, 352(1) # 8-byte Folded Spill
+; BE-NEXT: mr 30, 9
+; BE-NEXT: std 29, 376(1) # 8-byte Folded Spill
+; BE-NEXT: rotlwi 8, 5, 24
+; BE-NEXT: rldicl 9, 3, 32, 32
+; BE-NEXT: rlwimi 7, 3, 8, 24, 31
+; BE-NEXT: rldicl 3, 5, 32, 32
+; BE-NEXT: std 28, 368(1) # 8-byte Folded Spill
+; BE-NEXT: rlwimi 8, 5, 8, 8, 15
+; BE-NEXT: std 30, 384(1) # 8-byte Folded Spill
+; BE-NEXT: rotlwi 11, 3, 24
+; BE-NEXT: mr 0, 10
+; BE-NEXT: rotlwi 10, 9, 24
+; BE-NEXT: std 0, 392(1) # 8-byte Folded Spill
+; BE-NEXT: rlwimi 11, 3, 8, 8, 15
+; BE-NEXT: rlwimi 8, 5, 8, 24, 31
+; BE-NEXT: rlwimi 10, 9, 8, 8, 15
+; BE-NEXT: rlwimi 11, 3, 8, 24, 31
+; BE-NEXT: sldi 5, 8, 32
+; BE-NEXT: rlwimi 10, 9, 8, 24, 31
+; BE-NEXT: sldi 3, 7, 32
+; BE-NEXT: or 11, 5, 11
+; BE-NEXT: or 12, 3, 10
+; BE-NEXT: rlwinm 3, 11, 0, 30, 30
+; BE-NEXT: rlwinm 5, 11, 0, 29, 29
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 896(1) # 8-byte Folded Spill
+; BE-NEXT: clrldi 3, 11, 63
+; BE-NEXT: mulld 2, 12, 3
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 888(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 28, 28
+; BE-NEXT: rlwinm 5, 11, 0, 27, 27
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 872(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 880(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 26, 26
+; BE-NEXT: rlwinm 5, 11, 0, 25, 25
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 856(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 864(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 24, 24
+; BE-NEXT: rlwinm 5, 11, 0, 23, 23
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 840(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 848(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 22, 22
+; BE-NEXT: rlwinm 5, 11, 0, 21, 21
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 824(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 832(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 20, 20
+; BE-NEXT: rlwinm 5, 11, 0, 19, 19
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 808(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 816(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 18, 18
+; BE-NEXT: rlwinm 5, 11, 0, 17, 17
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 792(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 800(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 16, 16
+; BE-NEXT: rlwinm 5, 11, 0, 15, 15
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 776(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 784(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 14, 14
+; BE-NEXT: rlwinm 5, 11, 0, 13, 13
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 760(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 768(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 12, 12
+; BE-NEXT: rlwinm 5, 11, 0, 11, 11
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 744(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 752(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 10, 10
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 736(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 9, 9
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 728(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 8, 8
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 720(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 7, 7
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 712(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 6, 6
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 704(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 5, 5
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 696(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 4, 4
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 688(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 3, 3
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 680(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 2, 2
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 672(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 1, 1
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 664(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 0, 0
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 656(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 32, 32
+; BE-NEXT: rldicl 3, 3, 32, 31
+; BE-NEXT: rldicr 5, 11, 0, 0
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: mulld 5, 12, 5
+; BE-NEXT: std 3, 640(1) # 8-byte Folded Spill
+; BE-NEXT: std 5, 648(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 31, 33
+; BE-NEXT: rldicl 5, 11, 30, 34
+; BE-NEXT: rldicl 3, 3, 33, 30
+; BE-NEXT: rldicl 5, 5, 34, 29
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 624(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 632(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 29, 35
+; BE-NEXT: rldicl 3, 3, 35, 28
+; BE-NEXT: rldicl 5, 11, 28, 36
+; BE-NEXT: rldicl 5, 5, 36, 27
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 608(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 616(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 27, 37
+; BE-NEXT: rldicl 3, 3, 37, 26
+; BE-NEXT: rldicl 5, 11, 26, 38
+; BE-NEXT: rldicl 5, 5, 38, 25
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 592(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 600(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 25, 39
+; BE-NEXT: rldicl 3, 3, 39, 24
+; BE-NEXT: rldicl 5, 11, 24, 40
+; BE-NEXT: rldicl 5, 5, 40, 23
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 576(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 584(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 23, 41
+; BE-NEXT: rldicl 3, 3, 41, 22
+; BE-NEXT: rldicl 5, 11, 22, 42
+; BE-NEXT: rldicl 5, 5, 42, 21
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 560(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 568(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 21, 43
+; BE-NEXT: rldicl 3, 3, 43, 20
+; BE-NEXT: rldicl 5, 11, 20, 44
+; BE-NEXT: rldicl 5, 5, 44, 19
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 544(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 552(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 19, 45
+; BE-NEXT: rldicl 3, 3, 45, 18
+; BE-NEXT: rldicl 5, 11, 18, 46
+; BE-NEXT: rldicl 5, 5, 46, 17
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 528(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 536(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 17, 47
+; BE-NEXT: rldicl 3, 3, 47, 16
+; BE-NEXT: rldicl 5, 11, 16, 48
+; BE-NEXT: rldicl 5, 5, 48, 15
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 512(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 520(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 15, 49
+; BE-NEXT: rldicl 3, 3, 49, 14
+; BE-NEXT: rldicl 5, 11, 14, 50
+; BE-NEXT: rldicl 5, 5, 50, 13
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 496(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 504(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 13, 51
+; BE-NEXT: rldicl 3, 3, 51, 12
+; BE-NEXT: rldicl 5, 11, 12, 52
+; BE-NEXT: rldicl 5, 5, 52, 11
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 480(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 488(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 11, 53
+; BE-NEXT: rldicl 3, 3, 53, 10
+; BE-NEXT: rldicl 5, 11, 10, 54
+; BE-NEXT: rldicl 5, 5, 54, 9
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 464(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 472(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 9, 55
+; BE-NEXT: rldicl 3, 3, 55, 8
+; BE-NEXT: rldicl 5, 11, 8, 56
+; BE-NEXT: rldicl 5, 5, 56, 7
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 448(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 456(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 7, 57
+; BE-NEXT: rldicl 3, 3, 57, 6
+; BE-NEXT: rldicl 5, 11, 6, 58
+; BE-NEXT: rldicl 5, 5, 58, 5
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 432(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 440(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 5, 59
+; BE-NEXT: rldicl 3, 3, 59, 4
+; BE-NEXT: rldicl 5, 11, 4, 60
+; BE-NEXT: rldicl 5, 5, 60, 3
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 416(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 424(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 3, 61
+; BE-NEXT: rldicl 5, 11, 2, 62
+; BE-NEXT: rldicl 3, 3, 61, 2
+; BE-NEXT: rldicl 5, 5, 62, 1
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 400(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 408(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 3, 4, 1
+; BE-NEXT: rldicl 4, 4, 63, 1
+; BE-NEXT: and 3, 3, 27
+; BE-NEXT: and 4, 4, 26
+; BE-NEXT: or 3, 4, 3
+; BE-NEXT: sldi 4, 3, 2
+; BE-NEXT: rldicl 3, 3, 62, 2
+; BE-NEXT: and 4, 4, 29
+; BE-NEXT: and 3, 3, 28
+; BE-NEXT: or 3, 3, 4
+; BE-NEXT: sldi 4, 3, 4
+; BE-NEXT: rldicl 3, 3, 60, 4
+; BE-NEXT: and 4, 4, 30
+; BE-NEXT: and 3, 3, 0
+; BE-NEXT: or 3, 3, 4
+; BE-NEXT: rotlwi 4, 3, 24
+; BE-NEXT: rlwimi 4, 3, 8, 8, 15
+; BE-NEXT: rlwimi 4, 3, 8, 24, 31
+; BE-NEXT: rldicl 3, 3, 32, 32
+; BE-NEXT: rotlwi 5, 3, 24
+; BE-NEXT: rlwimi 5, 3, 8, 8, 15
+; BE-NEXT: rlwimi 5, 3, 8, 24, 31
+; BE-NEXT: sldi 3, 6, 1
+; BE-NEXT: rldicl 6, 6, 63, 1
+; BE-NEXT: and 3, 3, 27
+; BE-NEXT: and 6, 6, 26
+; BE-NEXT: or 3, 6, 3
+; BE-NEXT: sldi 6, 3, 2
+; BE-NEXT: rldicl 3, 3, 62, 2
+; BE-NEXT: and 6, 6, 29
+; BE-NEXT: and 3, 3, 28
+; BE-NEXT: or 3, 3, 6
+; BE-NEXT: sldi 6, 3, 4
+; BE-NEXT: rldicl 3, 3, 60, 4
+; BE-NEXT: and 6, 6, 30
+; BE-NEXT: and 3, 3, 0
+; BE-NEXT: or 3, 3, 6
+; BE-NEXT: rotlwi 6, 3, 24
+; BE-NEXT: rlwimi 6, 3, 8, 8, 15
+; BE-NEXT: rlwimi 6, 3, 8, 24, 31
+; BE-NEXT: rldicl 3, 3, 32, 32
+; BE-NEXT: rotlwi 7, 3, 24
+; BE-NEXT: rlwimi 7, 3, 8, 8, 15
+; BE-NEXT: rlwimi 7, 3, 8, 24, 31
+; BE-NEXT: sldi 3, 4, 32
+; BE-NEXT: or 4, 3, 5
+; BE-NEXT: sldi 3, 6, 32
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: rlwinm 5, 3, 0, 30, 30
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 344(1) # 8-byte Folded Spill
+; BE-NEXT: clrldi 5, 3, 63
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 336(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 29, 29
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 328(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 28, 28
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 320(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 27, 27
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 312(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 26, 26
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 304(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 25, 25
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 296(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 24, 24
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 288(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 23, 23
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 280(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 22, 22
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 272(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 21, 21
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 264(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 20, 20
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 256(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 19, 19
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 248(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 18, 18
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 240(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 17, 17
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 232(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 16, 16
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 224(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 15, 15
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 216(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 14, 14
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 208(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 13, 13
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 200(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 12, 12
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 192(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 11, 11
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 184(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 10, 10
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 176(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 9, 9
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 168(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 8, 8
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 160(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 7, 7
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 152(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 6, 6
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 144(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 5, 5
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 136(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 4, 4
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 128(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 3, 3
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 120(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 2, 2
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 112(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 1, 1
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 104(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 0, 0
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 96(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 5, 3, 32, 32
+; BE-NEXT: rldicl 5, 5, 32, 31
+; BE-NEXT: rldicr 6, 3, 0, 0
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: mulld 6, 4, 6
+; BE-NEXT: std 5, 80(1) # 8-byte Folded Spill
+; BE-NEXT: std 6, 88(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 5, 3, 31, 33
+; BE-NEXT: rldicl 5, 5, 33, 30
+; BE-NEXT: rldicl 6, 3, 30, 34
+; BE-NEXT: rldicl 6, 6, 34, 29
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 64(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 5, 4, 6
+; BE-NEXT: std 5, 72(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 5, 3, 29, 35
+; BE-NEXT: rldicl 6, 3, 28, 36
+; BE-NEXT: rldicl 5, 5, 35, 28
+; BE-NEXT: rldicl 6, 6, 36, 27
+; BE-NEXT: mulld 31, 4, 5
+; BE-NEXT: mulld 5, 4, 6
+; BE-NEXT: std 5, 56(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 5, 3, 27, 37
+; BE-NEXT: rldicl 5, 5, 37, 26
+; BE-NEXT: rldicl 6, 3, 26, 38
+; BE-NEXT: mulld 15, 4, 5
+; BE-NEXT: rldicl 5, 3, 25, 39
+; BE-NEXT: rldicl 6, 6, 38, 25
+; BE-NEXT: rldicl 5, 5, 39, 24
+; BE-NEXT: mulld 14, 4, 6
+; BE-NEXT: rldicl 6, 3, 24, 40
+; BE-NEXT: mulld 17, 4, 5
+; BE-NEXT: rldicl 5, 3, 23, 41
+; BE-NEXT: rldicl 6, 6, 40, 23
+; BE-NEXT: rldicl 5, 5, 41, 22
+; BE-NEXT: mulld 16, 4, 6
+; BE-NEXT: rldicl 6, 3, 22, 42
+; BE-NEXT: mulld 19, 4, 5
+; BE-NEXT: rldicl 5, 3, 21, 43
+; BE-NEXT: rldicl 6, 6, 42, 21
+; BE-NEXT: rldicl 5, 5, 43, 20
+; BE-NEXT: mulld 18, 4, 6
+; BE-NEXT: rldicl 6, 3, 20, 44
+; BE-NEXT: mulld 21, 4, 5
+; BE-NEXT: rldicl 5, 3, 19, 45
+; BE-NEXT: rldicl 6, 6, 44, 19
+; BE-NEXT: rldicl 5, 5, 45, 18
+; BE-NEXT: mulld 20, 4, 6
+; BE-NEXT: rldicl 6, 3, 18, 46
+; BE-NEXT: mulld 23, 4, 5
+; BE-NEXT: rldicl 5, 3, 17, 47
+; BE-NEXT: rldicl 6, 6, 46, 17
+; BE-NEXT: rldicl 5, 5, 47, 16
+; BE-NEXT: mulld 22, 4, 6
+; BE-NEXT: rldicl 6, 3, 16, 48
+; BE-NEXT: mulld 25, 4, 5
+; BE-NEXT: rldicl 5, 3, 15, 49
+; BE-NEXT: rldicl 6, 6, 48, 15
+; BE-NEXT: rldicl 5, 5, 49, 14
+; BE-NEXT: mulld 24, 4, 6
+; BE-NEXT: rldicl 6, 3, 14, 50
+; BE-NEXT: mulld 27, 4, 5
+; BE-NEXT: rldicl 5, 3, 13, 51
+; BE-NEXT: rldicl 6, 6, 50, 13
+; BE-NEXT: rldicl 5, 5, 51, 12
+; BE-NEXT: mulld 26, 4, 6
+; BE-NEXT: rldicl 6, 3, 12, 52
+; BE-NEXT: mulld 29, 4, 5
+; BE-NEXT: rldicl 5, 3, 11, 53
+; BE-NEXT: rldicl 6, 6, 52, 11
+; BE-NEXT: rldicl 5, 5, 53, 10
+; BE-NEXT: mulld 28, 4, 6
+; BE-NEXT: rldicl 6, 3, 10, 54
+; BE-NEXT: mulld 0, 4, 5
+; BE-NEXT: rldicl 5, 3, 9, 55
+; BE-NEXT: rldicl 6, 6, 54, 9
+; BE-NEXT: rldicl 5, 5, 55, 8
+; BE-NEXT: mulld 30, 4, 6
+; BE-NEXT: rldicl 6, 3, 8, 56
+; BE-NEXT: mulld 11, 4, 5
+; BE-NEXT: rldicl 5, 3, 7, 57
+; BE-NEXT: rldicl 6, 6, 56, 7
+; BE-NEXT: rldicl 5, 5, 57, 6
+; BE-NEXT: mulld 12, 4, 6
+; BE-NEXT: rldicl 6, 3, 6, 58
+; BE-NEXT: mulld 9, 4, 5
+; BE-NEXT: rldicl 5, 3, 5, 59
+; BE-NEXT: rldicl 6, 6, 58, 5
+; BE-NEXT: rldicl 5, 5, 59, 4
+; BE-NEXT: mulld 10, 4, 6
+; BE-NEXT: rldicl 6, 3, 4, 60
+; BE-NEXT: mulld 7, 4, 5
+; BE-NEXT: rldicl 5, 3, 3, 61
+; BE-NEXT: rldicl 3, 3, 2, 62
+; BE-NEXT: rldicl 6, 6, 60, 3
+; BE-NEXT: rldicl 3, 3, 62, 1
+; BE-NEXT: mulld 8, 4, 6
+; BE-NEXT: mulld 6, 4, 3
+; BE-NEXT: ld 3, 896(1) # 8-byte Folded Reload
+; BE-NEXT: rldicl 5, 5, 61, 2
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: xor 3, 2, 3
+; BE-NEXT: ld 4, 344(1) # 8-byte Folded Reload
+; BE-NEXT: ld 2, 336(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 2, 4
+; BE-NEXT: ld 2, 888(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 328(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 872(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 320(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 880(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 312(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 856(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 304(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 864(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 296(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 840(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 288(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 848(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 280(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 824(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 272(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 832(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 264(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 808(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 256(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 816(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 248(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 792(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 240(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 800(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 232(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 776(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 224(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 784(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 216(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 760(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 208(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 768(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 200(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 744(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 192(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 752(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 184(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 736(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 176(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 728(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 168(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 720(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 160(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 712(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 152(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 704(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 144(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 696(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 136(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 688(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 128(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 680(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 120(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 672(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 112(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 664(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 104(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 656(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 96(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 640(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 80(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 624(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 64(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 632(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 72(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 608(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 31
+; BE-NEXT: ld 31, 616(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: xor 3, 3, 31
+; BE-NEXT: ld 31, 56(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 31
+; BE-NEXT: ld 31, 592(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 15
+; BE-NEXT: xor 4, 4, 14
+; BE-NEXT: ld 15, 600(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 31
+; BE-NEXT: xor 4, 4, 17
+; BE-NEXT: xor 4, 4, 16
+; BE-NEXT: xor 3, 3, 15
+; BE-NEXT: ld 15, 576(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 19
+; BE-NEXT: xor 4, 4, 18
+; BE-NEXT: ld 17, 584(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 15
+; BE-NEXT: xor 4, 4, 21
+; BE-NEXT: xor 4, 4, 20
+; BE-NEXT: xor 3, 3, 17
+; BE-NEXT: ld 17, 560(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 23
+; BE-NEXT: xor 4, 4, 22
+; BE-NEXT: ld 19, 568(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 17
+; BE-NEXT: xor 4, 4, 25
+; BE-NEXT: xor 4, 4, 24
+; BE-NEXT: xor 3, 3, 19
+; BE-NEXT: ld 19, 544(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 27
+; BE-NEXT: xor 4, 4, 26
+; BE-NEXT: ld 21, 552(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 19
+; BE-NEXT: xor 4, 4, 29
+; BE-NEXT: xor 4, 4, 28
+; BE-NEXT: xor 3, 3, 21
+; BE-NEXT: ld 21, 528(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 0
+; BE-NEXT: xor 4, 4, 30
+; BE-NEXT: ld 23, 536(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 21
+; BE-NEXT: xor 4, 4, 11
+; BE-NEXT: xor 4, 4, 12
+; BE-NEXT: xor 3, 3, 23
+; BE-NEXT: ld 23, 512(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 9
+; BE-NEXT: xor 4, 4, 10
+; BE-NEXT: ld 25, 520(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 23
+; BE-NEXT: xor 4, 4, 7
+; BE-NEXT: xor 4, 4, 8
+; BE-NEXT: xor 3, 3, 25
+; BE-NEXT: ld 25, 496(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 5
+; BE-NEXT: xor 4, 4, 6
+; BE-NEXT: ld 27, 504(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 25
+; BE-NEXT: xor 3, 3, 27
+; BE-NEXT: ld 27, 480(1) # 8-byte Folded Reload
+; BE-NEXT: ld 29, 488(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 27
+; BE-NEXT: xor 3, 3, 29
+; BE-NEXT: ld 29, 464(1) # 8-byte Folded Reload
+; BE-NEXT: ld 0, 472(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 29
+; BE-NEXT: xor 3, 3, 0
+; BE-NEXT: ld 0, 448(1) # 8-byte Folded Reload
+; BE-NEXT: ld 11, 456(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 0
+; BE-NEXT: xor 3, 3, 11
+; BE-NEXT: ld 11, 432(1) # 8-byte Folded Reload
+; BE-NEXT: ld 9, 440(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 11
+; BE-NEXT: xor 3, 3, 9
+; BE-NEXT: ld 9, 416(1) # 8-byte Folded Reload
+; BE-NEXT: ld 7, 424(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 9
+; BE-NEXT: xor 3, 3, 7
+; BE-NEXT: ld 7, 400(1) # 8-byte Folded Reload
+; BE-NEXT: ld 5, 408(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 7
+; BE-NEXT: xor 3, 3, 5
+; BE-NEXT: ld 5, 648(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 5
+; BE-NEXT: ld 5, 88(1) # 8-byte Folded Reload
+; BE-NEXT: ld 7, 360(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 5
+; BE-NEXT: sldi 5, 3, 1
+; BE-NEXT: rldicl 3, 3, 63, 1
+; BE-NEXT: sldi 6, 4, 1
+; BE-NEXT: ld 8, 352(1) # 8-byte Folded Reload
+; BE-NEXT: rldicl 4, 4, 63, 1
+; BE-NEXT: and 5, 5, 7
+; BE-NEXT: and 3, 3, 8
+; BE-NEXT: and 6, 6, 7
+; BE-NEXT: ld 7, 376(1) # 8-byte Folded Reload
+; BE-NEXT: and 4, 4, 8
+; BE-NEXT: or 3, 3, 5
+; BE-NEXT: ld 8, 368(1) # 8-byte Folded Reload
+; BE-NEXT: or 4, 4, 6
+; BE-NEXT: sldi 5, 3, 2
+; BE-NEXT: rldicl 3, 3, 62, 2
+; BE-NEXT: sldi 6, 4, 2
+; BE-NEXT: ld 2, 904(1) # 8-byte Folded Reload
+; BE-NEXT: rldicl 4, 4, 62, 2
+; BE-NEXT: and 5, 5, 7
+; BE-NEXT: ld 31, 1048(1) # 8-byte Folded Reload
+; BE-NEXT: and 3, 3, 8
+; BE-NEXT: and 6, 6, 7
+; BE-NEXT: and 4, 4, 8
+; BE-NEXT: ld 8, 384(1) # 8-byte Folded Reload
+; BE-NEXT: or 3, 3, 5
+; BE-NEXT: sldi 5, 3, 4
+; BE-NEXT: rldicl 3, 3, 60, 4
+; BE-NEXT: ld 7, 392(1) # 8-byte Folded Reload
+; BE-NEXT: or 4, 4, 6
+; BE-NEXT: and 5, 5, 8
+; BE-NEXT: and 3, 3, 7
+; BE-NEXT: sldi 6, 4, 4
+; BE-NEXT: ld 30, 1040(1) # 8-byte Folded Reload
+; BE-NEXT: rldicl 4, 4, 60, 4
+; BE-NEXT: or 3, 3, 5
+; BE-NEXT: ld 29, 1032(1) # 8-byte Folded Reload
+; BE-NEXT: and 6, 6, 8
+; BE-NEXT: and 4, 4, 7
+; BE-NEXT: rotlwi 5, 3, 24
+; BE-NEXT: or 4, 4, 6
+; BE-NEXT: ld 28, 1024(1) # 8-byte Folded Reload
+; BE-NEXT: rlwimi 5, 3, 8, 8, 15
+; BE-NEXT: rotlwi 6, 4, 24
+; BE-NEXT: ld 27, 1016(1) # 8-byte Folded Reload
+; BE-NEXT: rldicl 7, 3, 32, 32
+; BE-NEXT: rlwimi 5, 3, 8, 24, 31
+; BE-NEXT: rldicl 3, 4, 32, 32
+; BE-NEXT: ld 26, 1008(1) # 8-byte Folded Reload
+; BE-NEXT: rlwimi 6, 4, 8, 8, 15
+; BE-NEXT: ld 25, 1000(1) # 8-byte Folded Reload
+; BE-NEXT: rotlwi 8, 7, 24
+; BE-NEXT: rotlwi 9, 3, 24
+; BE-NEXT: rlwimi 8, 7, 8, 8, 15
+; BE-NEXT: ld 24, 992(1) # 8-byte Folded Reload
+; BE-NEXT: rlwimi 9, 3, 8, 8, 15
+; BE-NEXT: ld 23, 984(1) # 8-byte Folded Reload
+; BE-NEXT: rlwimi 6, 4, 8, 24, 31
+; BE-NEXT: rlwimi 8, 7, 8, 24, 31
+; BE-NEXT: ld 22, 976(1) # 8-byte Folded Reload
+; BE-NEXT: rlwimi 9, 3, 8, 24, 31
+; BE-NEXT: ld 21, 968(1) # 8-byte Folded Reload
+; BE-NEXT: sldi 3, 5, 32
+; BE-NEXT: sldi 4, 6, 32
+; BE-NEXT: or 3, 3, 8
+; BE-NEXT: ld 20, 960(1) # 8-byte Folded Reload
+; BE-NEXT: or 4, 4, 9
+; BE-NEXT: ld 19, 952(1) # 8-byte Folded Reload
+; BE-NEXT: ld 18, 944(1) # 8-byte Folded Reload
+; BE-NEXT: ld 17, 936(1) # 8-byte Folded Reload
+; BE-NEXT: ld 16, 928(1) # 8-byte Folded Reload
+; BE-NEXT: ld 15, 920(1) # 8-byte Folded Reload
+; BE-NEXT: ld 14, 912(1) # 8-byte Folded Reload
+; BE-NEXT: addi 1, 1, 1056
+; BE-NEXT: blr
+;
+; LE-LABEL: clmulr_v2i64:
+; LE: # %bb.0:
+; LE-NEXT: stdu 1, -752(1)
+; LE-NEXT: lis 4, -21846
+; LE-NEXT: lis 5, 21845
+; LE-NEXT: xxswapd 1, 35
+; LE-NEXT: xxswapd 0, 34
+; LE-NEXT: mfvsrd 3, 35
+; LE-NEXT: mfvsrd 9, 34
+; LE-NEXT: lis 6, -13108
+; LE-NEXT: lis 7, 13107
+; LE-NEXT: ori 4, 4, 43690
+; LE-NEXT: ori 5, 5, 21845
+; LE-NEXT: mffprd 8, 1
+; LE-NEXT: mffprd 10, 0
+; LE-NEXT: std 28, 720(1) # 8-byte Folded Spill
+; LE-NEXT: std 29, 728(1) # 8-byte Folded Spill
+; LE-NEXT: ori 6, 6, 52428
+; LE-NEXT: ori 7, 7, 13107
+; LE-NEXT: sldi 4, 4, 32
+; LE-NEXT: sldi 5, 5, 32
+; LE-NEXT: sldi 6, 6, 32
+; LE-NEXT: sldi 7, 7, 32
+; LE-NEXT: sldi 11, 3, 1
+; LE-NEXT: rldicl 3, 3, 63, 1
+; LE-NEXT: std 30, 736(1) # 8-byte Folded Spill
+; LE-NEXT: lis 0, -3856
+; LE-NEXT: oris 4, 4, 43690
+; LE-NEXT: oris 5, 5, 21845
+; LE-NEXT: lis 30, 3855
+; LE-NEXT: oris 6, 6, 52428
+; LE-NEXT: sldi 12, 10, 1
+; LE-NEXT: rldicl 10, 10, 63, 1
+; LE-NEXT: oris 7, 7, 13107
+; LE-NEXT: std 27, 712(1) # 8-byte Folded Spill
+; LE-NEXT: ori 28, 4, 43690
+; LE-NEXT: ori 29, 5, 21845
+; LE-NEXT: std 14, 608(1) # 8-byte Folded Spill
+; LE-NEXT: std 15, 616(1) # 8-byte Folded Spill
+; LE-NEXT: sldi 4, 8, 1
+; LE-NEXT: rldicl 5, 8, 63, 1
+; LE-NEXT: std 16, 624(1) # 8-byte Folded Spill
+; LE-NEXT: std 17, 632(1) # 8-byte Folded Spill
+; LE-NEXT: sldi 8, 9, 1
+; LE-NEXT: rldicl 9, 9, 63, 1
+; LE-NEXT: std 28, 584(1) # 8-byte Folded Spill
+; LE-NEXT: std 29, 592(1) # 8-byte Folded Spill
+; LE-NEXT: and 11, 11, 28
+; LE-NEXT: and 3, 3, 29
+; LE-NEXT: std 18, 640(1) # 8-byte Folded Spill
+; LE-NEXT: std 19, 648(1) # 8-byte Folded Spill
+; LE-NEXT: and 4, 4, 28
+; LE-NEXT: and 5, 5, 29
+; LE-NEXT: std 20, 656(1) # 8-byte Folded Spill
+; LE-NEXT: std 21, 664(1) # 8-byte Folded Spill
+; LE-NEXT: and 8, 8, 28
+; LE-NEXT: and 9, 9, 29
+; LE-NEXT: std 22, 672(1) # 8-byte Folded Spill
+; LE-NEXT: std 23, 680(1) # 8-byte Folded Spill
+; LE-NEXT: and 12, 12, 28
+; LE-NEXT: and 10, 10, 29
+; LE-NEXT: std 24, 688(1) # 8-byte Folded Spill
+; LE-NEXT: std 25, 696(1) # 8-byte Folded Spill
+; LE-NEXT: or 3, 3, 11
+; LE-NEXT: or 4, 5, 4
+; LE-NEXT: std 26, 704(1) # 8-byte Folded Spill
+; LE-NEXT: std 31, 744(1) # 8-byte Folded Spill
+; LE-NEXT: ori 5, 0, 61680
+; LE-NEXT: ori 11, 30, 3855
+; LE-NEXT: std 2, 600(1) # 8-byte Folded Spill
+; LE-NEXT: ori 30, 6, 52428
+; LE-NEXT: ori 0, 7, 13107
+; LE-NEXT: std 30, 568(1) # 8-byte Folded Spill
+; LE-NEXT: std 0, 576(1) # 8-byte Folded Spill
+; LE-NEXT: or 6, 9, 8
+; LE-NEXT: or 7, 10, 12
+; LE-NEXT: sldi 8, 3, 2
+; LE-NEXT: rldicl 3, 3, 62, 2
+; LE-NEXT: sldi 9, 4, 2
+; LE-NEXT: rldicl 4, 4, 62, 2
+; LE-NEXT: sldi 5, 5, 32
+; LE-NEXT: sldi 10, 11, 32
+; LE-NEXT: sldi 11, 6, 2
+; LE-NEXT: rldicl 6, 6, 62, 2
+; LE-NEXT: sldi 12, 7, 2
+; LE-NEXT: rldicl 7, 7, 62, 2
+; LE-NEXT: and 8, 8, 30
+; LE-NEXT: and 3, 3, 0
+; LE-NEXT: and 9, 9, 30
+; LE-NEXT: and 4, 4, 0
+; LE-NEXT: oris 5, 5, 61680
+; LE-NEXT: oris 10, 10, 3855
+; LE-NEXT: and 11, 11, 30
+; LE-NEXT: and 6, 6, 0
+; LE-NEXT: and 12, 12, 30
+; LE-NEXT: and 7, 7, 0
+; LE-NEXT: or 3, 3, 8
+; LE-NEXT: or 4, 4, 9
+; LE-NEXT: ori 30, 5, 61680
+; LE-NEXT: std 30, 552(1) # 8-byte Folded Spill
+; LE-NEXT: ori 0, 10, 3855
+; LE-NEXT: std 0, 560(1) # 8-byte Folded Spill
+; LE-NEXT: or 5, 6, 11
+; LE-NEXT: or 6, 7, 12
+; LE-NEXT: sldi 7, 3, 4
+; LE-NEXT: rldicl 3, 3, 60, 4
+; LE-NEXT: sldi 8, 4, 4
+; LE-NEXT: rldicl 4, 4, 60, 4
+; LE-NEXT: sldi 9, 5, 4
+; LE-NEXT: rldicl 5, 5, 60, 4
+; LE-NEXT: sldi 10, 6, 4
+; LE-NEXT: rldicl 6, 6, 60, 4
+; LE-NEXT: and 7, 7, 30
+; LE-NEXT: and 3, 3, 0
+; LE-NEXT: and 8, 8, 30
+; LE-NEXT: and 4, 4, 0
+; LE-NEXT: and 9, 9, 30
+; LE-NEXT: and 5, 5, 0
+; LE-NEXT: and 10, 10, 30
+; LE-NEXT: and 6, 6, 0
+; LE-NEXT: or 3, 3, 7
+; LE-NEXT: or 4, 4, 8
+; LE-NEXT: or 5, 5, 9
+; LE-NEXT: or 6, 6, 10
+; LE-NEXT: rldicl 7, 3, 32, 32
+; LE-NEXT: rotlwi 8, 3, 24
+; LE-NEXT: rldicl 9, 4, 32, 32
+; LE-NEXT: rotlwi 10, 4, 24
+; LE-NEXT: rldicl 11, 5, 32, 32
+; LE-NEXT: rotlwi 12, 5, 24
+; LE-NEXT: rotlwi 29, 7, 24
+; LE-NEXT: rlwimi 8, 3, 8, 8, 15
+; LE-NEXT: rotlwi 28, 9, 24
+; LE-NEXT: rlwimi 10, 4, 8, 8, 15
+; LE-NEXT: rlwimi 8, 3, 8, 24, 31
+; LE-NEXT: rlwimi 10, 4, 8, 24, 31
+; LE-NEXT: rotlwi 4, 11, 24
+; LE-NEXT: rlwimi 12, 5, 8, 8, 15
+; LE-NEXT: rlwimi 29, 7, 8, 8, 15
+; LE-NEXT: sldi 3, 8, 32
+; LE-NEXT: rlwimi 28, 9, 8, 8, 15
+; LE-NEXT: sldi 8, 10, 32
+; LE-NEXT: rlwimi 12, 5, 8, 24, 31
+; LE-NEXT: rlwimi 29, 7, 8, 24, 31
+; LE-NEXT: rlwimi 28, 9, 8, 24, 31
+; LE-NEXT: rlwimi 4, 11, 8, 8, 15
+; LE-NEXT: sldi 5, 12, 32
+; LE-NEXT: or 9, 3, 29
+; LE-NEXT: or 3, 8, 28
+; LE-NEXT: rlwimi 4, 11, 8, 24, 31
+; LE-NEXT: or 10, 5, 4
+; LE-NEXT: rlwinm 4, 3, 0, 30, 30
+; LE-NEXT: std 4, 544(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 5, 5
+; LE-NEXT: std 4, 384(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 4, 4
+; LE-NEXT: std 4, 376(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 3, 3
+; LE-NEXT: std 4, 368(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 2, 2
+; LE-NEXT: std 4, 360(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 1, 1
+; LE-NEXT: std 4, 352(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 0, 0
+; LE-NEXT: std 4, 344(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 32, 32
+; LE-NEXT: std 4, 336(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 31, 33
+; LE-NEXT: std 4, 280(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 30, 34
+; LE-NEXT: std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 29, 35
+; LE-NEXT: std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 28, 36
+; LE-NEXT: std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 27, 37
+; LE-NEXT: std 4, 248(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 26, 38
+; LE-NEXT: std 4, 240(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 25, 39
+; LE-NEXT: std 4, 232(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 24, 40
+; LE-NEXT: rldicl 0, 6, 32, 32
+; LE-NEXT: rotlwi 30, 6, 24
+; LE-NEXT: rotlwi 27, 0, 24
+; LE-NEXT: std 4, 224(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 23, 41
+; LE-NEXT: rlwimi 30, 6, 8, 8, 15
+; LE-NEXT: rlwimi 30, 6, 8, 24, 31
+; LE-NEXT: rlwimi 27, 0, 8, 8, 15
+; LE-NEXT: std 4, 216(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 22, 42
+; LE-NEXT: sldi 6, 30, 32
+; LE-NEXT: rlwimi 27, 0, 8, 24, 31
+; LE-NEXT: or 11, 6, 27
+; LE-NEXT: std 4, 208(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 21, 43
+; LE-NEXT: clrldi 5, 3, 63
+; LE-NEXT: rlwinm 6, 3, 0, 29, 29
+; LE-NEXT: rlwinm 7, 3, 0, 28, 28
+; LE-NEXT: std 4, 200(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 20, 44
+; LE-NEXT: rlwinm 8, 3, 0, 27, 27
+; LE-NEXT: rlwinm 12, 3, 0, 26, 26
+; LE-NEXT: rlwinm 0, 3, 0, 25, 25
+; LE-NEXT: std 4, 192(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 19, 45
+; LE-NEXT: rlwinm 30, 3, 0, 24, 24
+; LE-NEXT: rlwinm 29, 3, 0, 23, 23
+; LE-NEXT: rlwinm 28, 3, 0, 22, 22
+; LE-NEXT: std 4, 184(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 18, 46
+; LE-NEXT: rlwinm 27, 3, 0, 21, 21
+; LE-NEXT: rlwinm 26, 3, 0, 20, 20
+; LE-NEXT: rlwinm 25, 3, 0, 19, 19
+; LE-NEXT: std 4, 176(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 17, 47
+; LE-NEXT: rlwinm 24, 3, 0, 18, 18
+; LE-NEXT: rlwinm 23, 3, 0, 17, 17
+; LE-NEXT: rlwinm 22, 3, 0, 16, 16
+; LE-NEXT: std 4, 168(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 16, 48
+; LE-NEXT: rlwinm 21, 3, 0, 15, 15
+; LE-NEXT: rlwinm 20, 3, 0, 14, 14
+; LE-NEXT: rlwinm 19, 3, 0, 13, 13
+; LE-NEXT: std 4, 160(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 15, 49
+; LE-NEXT: rlwinm 18, 3, 0, 12, 12
+; LE-NEXT: rlwinm 17, 3, 0, 11, 11
+; LE-NEXT: rlwinm 16, 3, 0, 10, 10
+; LE-NEXT: std 4, 152(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 14, 50
+; LE-NEXT: rlwinm 15, 3, 0, 9, 9
+; LE-NEXT: rlwinm 14, 3, 0, 8, 8
+; LE-NEXT: rlwinm 31, 3, 0, 7, 7
+; LE-NEXT: std 4, 144(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 13, 51
+; LE-NEXT: rlwinm 2, 3, 0, 6, 6
+; LE-NEXT: std 4, 136(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 12, 52
+; LE-NEXT: std 4, 128(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 11, 53
+; LE-NEXT: std 4, 120(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 10, 54
+; LE-NEXT: std 4, 112(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 9, 55
+; LE-NEXT: std 4, 104(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 8, 56
+; LE-NEXT: std 4, 96(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 7, 57
+; LE-NEXT: std 4, 88(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 6, 58
+; LE-NEXT: std 4, 80(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 5, 59
+; LE-NEXT: std 4, 72(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 4, 60
+; LE-NEXT: std 4, 64(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 3, 61
+; LE-NEXT: std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 2, 62
+; LE-NEXT: rldicr 3, 3, 0, 0
+; LE-NEXT: std 3, 40(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT: std 4, 48(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 296(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 5
+; LE-NEXT: std 3, 288(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 6
+; LE-NEXT: std 3, 304(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 7
+; LE-NEXT: std 3, 312(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 8
+; LE-NEXT: std 3, 320(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 12
+; LE-NEXT: std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 0
+; LE-NEXT: std 3, 544(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 30
+; LE-NEXT: std 3, 536(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 29
+; LE-NEXT: std 3, 528(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 28
+; LE-NEXT: std 3, 520(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 27
+; LE-NEXT: std 3, 512(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 26
+; LE-NEXT: std 3, 504(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 25
+; LE-NEXT: std 3, 496(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 24
+; LE-NEXT: std 3, 488(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 23
+; LE-NEXT: std 3, 480(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 22
+; LE-NEXT: std 3, 472(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 21
+; LE-NEXT: std 3, 464(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 20
+; LE-NEXT: std 3, 456(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 19
+; LE-NEXT: std 3, 448(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 18
+; LE-NEXT: std 3, 440(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 17
+; LE-NEXT: std 3, 432(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 16
+; LE-NEXT: std 3, 424(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 15
+; LE-NEXT: std 3, 416(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 14
+; LE-NEXT: std 3, 408(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 31
+; LE-NEXT: std 3, 400(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 2
+; LE-NEXT: std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 384(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 384(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 376(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 376(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 368(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 368(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 360(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 360(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 352(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 352(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 344(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 344(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 336(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 4, 3, 32, 31
+; LE-NEXT: ld 3, 280(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 5, 3, 33, 30
+; LE-NEXT: ld 3, 272(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 6, 3, 34, 29
+; LE-NEXT: ld 3, 264(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 7, 3, 35, 28
+; LE-NEXT: ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 8, 3, 36, 27
+; LE-NEXT: ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 12, 3, 37, 26
+; LE-NEXT: ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 0, 3, 38, 25
+; LE-NEXT: ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 30, 3, 39, 24
+; LE-NEXT: ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 29, 3, 40, 23
+; LE-NEXT: ld 3, 216(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 28, 3, 41, 22
+; LE-NEXT: ld 3, 208(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 27, 3, 42, 21
+; LE-NEXT: ld 3, 200(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 26, 3, 43, 20
+; LE-NEXT: ld 3, 192(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 25, 3, 44, 19
+; LE-NEXT: ld 3, 184(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 24, 3, 45, 18
+; LE-NEXT: ld 3, 176(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 23, 3, 46, 17
+; LE-NEXT: ld 3, 168(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 22, 3, 47, 16
+; LE-NEXT: ld 3, 160(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 21, 3, 48, 15
+; LE-NEXT: ld 3, 152(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 20, 3, 49, 14
+; LE-NEXT: ld 3, 144(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 19, 3, 50, 13
+; LE-NEXT: ld 3, 136(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 18, 3, 51, 12
+; LE-NEXT: ld 3, 128(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 17, 3, 52, 11
+; LE-NEXT: ld 3, 120(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 16, 3, 53, 10
+; LE-NEXT: ld 3, 112(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 15, 3, 54, 9
+; LE-NEXT: ld 3, 104(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 14, 3, 55, 8
+; LE-NEXT: ld 3, 96(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 31, 3, 56, 7
+; LE-NEXT: ld 3, 88(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 2, 3, 57, 6
+; LE-NEXT: ld 3, 80(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 58, 5
+; LE-NEXT: std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 72(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 59, 4
+; LE-NEXT: std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 64(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 60, 3
+; LE-NEXT: std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 56(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 61, 2
+; LE-NEXT: std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 48(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 62, 1
+; LE-NEXT: std 3, 224(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 336(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 4
+; LE-NEXT: clrldi 4, 9, 63
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 5
+; LE-NEXT: ld 5, 288(1) # 8-byte Folded Reload
+; LE-NEXT: std 3, 272(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 6
+; LE-NEXT: mulld 6, 11, 7
+; LE-NEXT: mulld 7, 11, 8
+; LE-NEXT: mulld 8, 11, 12
+; LE-NEXT: mulld 12, 11, 0
+; LE-NEXT: mulld 0, 11, 30
+; LE-NEXT: mulld 30, 11, 29
+; LE-NEXT: mulld 29, 11, 28
+; LE-NEXT: mulld 28, 11, 27
+; LE-NEXT: mulld 27, 11, 26
+; LE-NEXT: mulld 26, 11, 25
+; LE-NEXT: mulld 25, 11, 24
+; LE-NEXT: mulld 24, 11, 23
+; LE-NEXT: mulld 23, 11, 22
+; LE-NEXT: mulld 22, 11, 21
+; LE-NEXT: mulld 21, 11, 20
+; LE-NEXT: mulld 20, 11, 19
+; LE-NEXT: mulld 19, 11, 18
+; LE-NEXT: mulld 18, 11, 17
+; LE-NEXT: mulld 17, 11, 16
+; LE-NEXT: mulld 16, 11, 15
+; LE-NEXT: mulld 15, 11, 14
+; LE-NEXT: mulld 14, 11, 31
+; LE-NEXT: mulld 31, 11, 2
+; LE-NEXT: std 3, 264(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 2, 11, 3
+; LE-NEXT: ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 11, 11, 3
+; LE-NEXT: rlwinm 3, 9, 0, 30, 30
+; LE-NEXT: mulld 3, 10, 3
+; LE-NEXT: xor 3, 4, 3
+; LE-NEXT: ld 4, 296(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 5, 4
+; LE-NEXT: rlwinm 5, 9, 0, 29, 29
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 304(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 28, 28
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 312(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 27, 27
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 320(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 26, 26
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 328(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 25, 25
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 4, 3
+; LE-NEXT: ld 4, 536(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 528(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 520(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 512(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 504(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 496(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 488(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 480(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 472(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 464(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 456(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 448(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 440(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 432(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 424(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 416(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 408(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 400(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 392(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 384(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 376(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 368(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 360(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 352(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 344(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 280(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 272(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 264(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 256(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 6
+; LE-NEXT: ld 6, 592(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 7
+; LE-NEXT: ld 7, 584(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 8
+; LE-NEXT: ld 8, 576(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 12
+; LE-NEXT: ld 12, 560(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 0
+; LE-NEXT: ld 0, 552(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 30
+; LE-NEXT: ld 30, 736(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 29
+; LE-NEXT: ld 29, 728(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 28
+; LE-NEXT: ld 28, 720(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 27
+; LE-NEXT: ld 27, 712(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 26
+; LE-NEXT: ld 26, 704(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 25
+; LE-NEXT: ld 25, 696(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 24
+; LE-NEXT: ld 24, 688(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 23
+; LE-NEXT: ld 23, 680(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 22
+; LE-NEXT: ld 22, 672(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 21
+; LE-NEXT: ld 21, 664(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 20
+; LE-NEXT: ld 20, 656(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 19
+; LE-NEXT: ld 19, 648(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 18
+; LE-NEXT: ld 18, 640(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 17
+; LE-NEXT: ld 17, 632(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 16
+; LE-NEXT: ld 16, 624(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 15
+; LE-NEXT: ld 15, 616(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 14
+; LE-NEXT: ld 14, 608(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 31
+; LE-NEXT: ld 31, 744(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 2
+; LE-NEXT: ld 2, 600(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 248(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 240(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 336(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 11
+; LE-NEXT: ld 11, 568(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: sldi 4, 3, 1
+; LE-NEXT: rldicl 3, 3, 63, 1
+; LE-NEXT: and 4, 4, 7
+; LE-NEXT: and 3, 3, 6
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: sldi 4, 3, 2
+; LE-NEXT: rldicl 3, 3, 62, 2
+; LE-NEXT: and 4, 4, 11
+; LE-NEXT: and 3, 3, 8
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: sldi 4, 3, 4
+; LE-NEXT: rldicl 3, 3, 60, 4
+; LE-NEXT: and 4, 4, 0
+; LE-NEXT: and 3, 3, 12
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: rotlwi 5, 3, 24
+; LE-NEXT: rldicl 4, 3, 32, 32
+; LE-NEXT: rlwimi 5, 3, 8, 8, 15
+; LE-NEXT: rlwimi 5, 3, 8, 24, 31
+; LE-NEXT: rotlwi 3, 4, 24
+; LE-NEXT: rlwimi 3, 4, 8, 8, 15
+; LE-NEXT: rlwimi 3, 4, 8, 24, 31
+; LE-NEXT: sldi 4, 5, 32
+; LE-NEXT: or 3, 4, 3
+; LE-NEXT: ld 4, 328(1) # 8-byte Folded Reload
+; LE-NEXT: mtfprd 0, 3
+; LE-NEXT: rlwinm 3, 9, 0, 24, 24
+; LE-NEXT: mulld 3, 10, 3
+; LE-NEXT: xor 3, 4, 3
+; LE-NEXT: rlwinm 4, 9, 0, 23, 23
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 22, 22
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 21, 21
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 20, 20
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 19, 19
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 18, 18
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 17, 17
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 16, 16
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 15, 15
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 14, 14
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 13, 13
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 12, 12
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 11, 11
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 10, 10
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 9, 9
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 8, 8
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 7, 7
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 6, 6
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 5, 5
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 4, 4
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 3, 3
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 2, 2
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 1, 1
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 0, 0
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 32, 32
+; LE-NEXT: rldicl 4, 4, 32, 31
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 31, 33
+; LE-NEXT: rldicl 4, 4, 33, 30
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 30, 34
+; LE-NEXT: rldicl 4, 4, 34, 29
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 29, 35
+; LE-NEXT: rldicl 4, 4, 35, 28
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 28, 36
+; LE-NEXT: rldicl 4, 4, 36, 27
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 27, 37
+; LE-NEXT: rldicl 4, 4, 37, 26
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 26, 38
+; LE-NEXT: rldicl 4, 4, 38, 25
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 25, 39
+; LE-NEXT: rldicl 4, 4, 39, 24
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 24, 40
+; LE-NEXT: rldicl 4, 4, 40, 23
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 23, 41
+; LE-NEXT: rldicl 4, 4, 41, 22
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 22, 42
+; LE-NEXT: rldicl 4, 4, 42, 21
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 21, 43
+; LE-NEXT: rldicl 4, 4, 43, 20
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 20, 44
+; LE-NEXT: rldicl 4, 4, 44, 19
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 19, 45
+; LE-NEXT: rldicl 4, 4, 45, 18
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 18, 46
+; LE-NEXT: rldicl 4, 4, 46, 17
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 17, 47
+; LE-NEXT: rldicl 4, 4, 47, 16
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 16, 48
+; LE-NEXT: rldicl 4, 4, 48, 15
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 15, 49
+; LE-NEXT: rldicl 4, 4, 49, 14
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 14, 50
+; LE-NEXT: rldicl 4, 4, 50, 13
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 13, 51
+; LE-NEXT: rldicl 4, 4, 51, 12
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 12, 52
+; LE-NEXT: rldicl 4, 4, 52, 11
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 11, 53
+; LE-NEXT: rldicl 4, 4, 53, 10
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 10, 54
+; LE-NEXT: rldicl 4, 4, 54, 9
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 9, 55
+; LE-NEXT: rldicl 4, 4, 55, 8
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 8, 56
+; LE-NEXT: rldicl 4, 4, 56, 7
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 7, 57
+; LE-NEXT: rldicl 4, 4, 57, 6
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 6, 58
+; LE-NEXT: rldicl 4, 4, 58, 5
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 5, 59
+; LE-NEXT: rldicl 4, 4, 59, 4
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 4, 60
+; LE-NEXT: rldicl 4, 4, 60, 3
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 3, 61
+; LE-NEXT: rldicl 4, 4, 61, 2
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 2, 62
+; LE-NEXT: rldicl 4, 4, 62, 1
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicr 4, 9, 0, 0
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: sldi 4, 3, 1
+; LE-NEXT: rldicl 3, 3, 63, 1
+; LE-NEXT: and 4, 4, 7
+; LE-NEXT: and 3, 3, 6
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: sldi 4, 3, 2
+; LE-NEXT: rldicl 3, 3, 62, 2
+; LE-NEXT: and 4, 4, 11
+; LE-NEXT: and 3, 3, 8
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: sldi 4, 3, 4
+; LE-NEXT: rldicl 3, 3, 60, 4
+; LE-NEXT: and 4, 4, 0
+; LE-NEXT: and 3, 3, 12
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: rldicl 4, 3, 32, 32
+; LE-NEXT: rotlwi 5, 4, 24
+; LE-NEXT: rlwimi 5, 4, 8, 8, 15
+; LE-NEXT: rlwimi 5, 4, 8, 24, 31
+; LE-NEXT: rotlwi 4, 3, 24
+; LE-NEXT: rlwimi 4, 3, 8, 8, 15
+; LE-NEXT: rlwimi 4, 3, 8, 24, 31
+; LE-NEXT: sldi 3, 4, 32
+; LE-NEXT: or 3, 3, 5
+; LE-NEXT: mtfprd 1, 3
+; LE-NEXT: xxmrghd 34, 1, 0
+; LE-NEXT: addi 1, 1, 752
+; LE-NEXT: blr
+ %a.ext = zext <2 x i64> %a to <2 x i128>
+ %b.ext = zext <2 x i64> %b to <2 x i128>
+ %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+ %res.ext = lshr <2 x i128> %clmul, splat (i128 63)
+ %res = trunc <2 x i128> %res.ext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+; clmulh_v16i8: per-lane high half of an 8-bit carry-less multiply.
+; Both <16 x i8> operands are zero-extended to <16 x i16>, multiplied with
+; @llvm.clmul.v16i16, and the product is shifted right by 8 and truncated
+; back to <16 x i8>. The BE and LE check prefixes hold the expected assembly
+; for two separate llc configurations (RUN lines are not visible here;
+; presumably big- and little-endian PowerPC -- confirm against the file head).
+define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; BE-LABEL: clmulh_v16i8:
+; BE: # %bb.0:
+; BE-NEXT: li 3, -48
+; BE-NEXT: vspltisb 4, 4
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -32
+; BE-NEXT: vsrb 1, 3, 4
+; BE-NEXT: vspltisb 5, 15
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -16
+; BE-NEXT: vspltisb 7, -1
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI8_0@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI8_0@toc@l
+; BE-NEXT: vand 3, 3, 5
+; BE-NEXT: vspltisb 13, 8
+; BE-NEXT: vslb 3, 3, 4
+; BE-NEXT: vsrb 0, 2, 4
+; BE-NEXT: vand 2, 2, 5
+; BE-NEXT: vor 1, 1, 3
+; BE-NEXT: lvx 3, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI8_1@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI8_1@toc@l
+; BE-NEXT: vslb 2, 2, 4
+; BE-NEXT: vor 0, 0, 2
+; BE-NEXT: vspltisb 2, 2
+; BE-NEXT: vsrb 9, 1, 2
+; BE-NEXT: vand 1, 1, 3
+; BE-NEXT: vand 9, 9, 3
+; BE-NEXT: vslb 1, 1, 2
+; BE-NEXT: vsrb 8, 0, 2
+; BE-NEXT: vand 0, 0, 3
+; BE-NEXT: vor 9, 9, 1
+; BE-NEXT: lvx 1, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI8_3@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI8_3@toc@l
+; BE-NEXT: lvx 15, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI8_2@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI8_2@toc@l
+; BE-NEXT: vand 8, 8, 3
+; BE-NEXT: vslb 0, 0, 2
+; BE-NEXT: vor 8, 8, 0
+; BE-NEXT: vspltisb 0, 1
+; BE-NEXT: vsrb 11, 9, 0
+; BE-NEXT: vand 9, 9, 1
+; BE-NEXT: vaddubm 9, 9, 9
+; BE-NEXT: vand 11, 11, 1
+; BE-NEXT: vsrb 10, 8, 0
+; BE-NEXT: vand 8, 8, 1
+; BE-NEXT: vaddubm 8, 8, 8
+; BE-NEXT: vor 9, 11, 9
+; BE-NEXT: vslb 6, 4, 4
+; BE-NEXT: vslb 7, 7, 7
+; BE-NEXT: vand 10, 10, 1
+; BE-NEXT: vand 14, 9, 13
+; BE-NEXT: vaddubm 13, 13, 13
+; BE-NEXT: vor 8, 10, 8
+; BE-NEXT: vand 10, 9, 2
+; BE-NEXT: vand 11, 9, 0
+; BE-NEXT: vand 12, 9, 4
+; BE-NEXT: vand 13, 9, 13
+; BE-NEXT: vand 15, 9, 15
+; BE-NEXT: vand 6, 9, 6
+; BE-NEXT: vand 7, 9, 7
+; BE-NEXT: vmuloub 9, 8, 10
+; BE-NEXT: vmuleub 10, 8, 10
+; BE-NEXT: vmuloub 16, 8, 11
+; BE-NEXT: vmuleub 11, 8, 11
+; BE-NEXT: vmuloub 17, 8, 12
+; BE-NEXT: vmuleub 12, 8, 12
+; BE-NEXT: vmuloub 18, 8, 14
+; BE-NEXT: vmuleub 14, 8, 14
+; BE-NEXT: vmuloub 19, 8, 13
+; BE-NEXT: vmuleub 13, 8, 13
+; BE-NEXT: vmuloub 31, 8, 15
+; BE-NEXT: vmuleub 15, 8, 15
+; BE-NEXT: vmuloub 30, 8, 6
+; BE-NEXT: vmuleub 6, 8, 6
+; BE-NEXT: vmuloub 29, 8, 7
+; BE-NEXT: vmuleub 7, 8, 7
+; BE-NEXT: lvx 8, 0, 3
+; BE-NEXT: li 3, -16
+; BE-NEXT: vperm 9, 10, 9, 8
+; BE-NEXT: vperm 10, 11, 16, 8
+; BE-NEXT: vperm 11, 12, 17, 8
+; BE-NEXT: vperm 12, 14, 18, 8
+; BE-NEXT: vperm 13, 13, 19, 8
+; BE-NEXT: vperm 14, 15, 31, 8
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -32
+; BE-NEXT: vperm 6, 6, 30, 8
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -48
+; BE-NEXT: vperm 7, 7, 29, 8
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: vxor 8, 10, 9
+; BE-NEXT: vxor 8, 8, 11
+; BE-NEXT: vxor 8, 8, 12
+; BE-NEXT: vxor 8, 8, 13
+; BE-NEXT: vxor 8, 8, 14
+; BE-NEXT: vxor 6, 8, 6
+; BE-NEXT: vxor 6, 6, 7
+; BE-NEXT: vand 5, 6, 5
+; BE-NEXT: vsrb 7, 6, 4
+; BE-NEXT: vslb 4, 5, 4
+; BE-NEXT: vor 4, 7, 4
+; BE-NEXT: vand 5, 4, 3
+; BE-NEXT: vsrb 4, 4, 2
+; BE-NEXT: vslb 2, 5, 2
+; BE-NEXT: vand 3, 4, 3
+; BE-NEXT: vor 2, 3, 2
+; BE-NEXT: vsrb 3, 2, 0
+; BE-NEXT: vand 2, 2, 1
+; BE-NEXT: vaddubm 2, 2, 2
+; BE-NEXT: vand 3, 3, 1
+; BE-NEXT: vor 2, 3, 2
+; BE-NEXT: vsrb 2, 2, 0
+; BE-NEXT: blr
+;
+; LE-LABEL: clmulh_v16i8:
+; LE: # %bb.0:
+; LE-NEXT: addis 3, 2, .LCPI8_0@toc@ha
+; LE-NEXT: vspltisb 4, 4
+; LE-NEXT: vspltisb 5, 2
+; LE-NEXT: addi 3, 3, .LCPI8_0@toc@l
+; LE-NEXT: vslb 1, 3, 4
+; LE-NEXT: vsrb 3, 3, 4
+; LE-NEXT: vslb 6, 2, 4
+; LE-NEXT: vsrb 2, 2, 4
+; LE-NEXT: lxvd2x 0, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI8_1@toc@ha
+; LE-NEXT: xxlor 35, 35, 33
+; LE-NEXT: xxlor 34, 34, 38
+; LE-NEXT: vspltisb 0, 1
+; LE-NEXT: addi 3, 3, .LCPI8_1@toc@l
+; LE-NEXT: vsrb 1, 3, 5
+; LE-NEXT: vsrb 7, 2, 5
+; LE-NEXT: vspltisb 6, 8
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI8_2@toc@ha
+; LE-NEXT: xxland 35, 35, 0
+; LE-NEXT: xxland 34, 34, 0
+; LE-NEXT: xxland 2, 33, 0
+; LE-NEXT: xxland 3, 39, 0
+; LE-NEXT: addi 3, 3, .LCPI8_2@toc@l
+; LE-NEXT: vslb 3, 3, 5
+; LE-NEXT: vslb 2, 2, 5
+; LE-NEXT: xxlor 35, 2, 35
+; LE-NEXT: xxlor 34, 3, 34
+; LE-NEXT: lxvd2x 3, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI8_3@toc@ha
+; LE-NEXT: vsrb 1, 3, 0
+; LE-NEXT: xxland 35, 35, 1
+; LE-NEXT: vsrb 7, 2, 0
+; LE-NEXT: xxland 34, 34, 1
+; LE-NEXT: addi 3, 3, .LCPI8_3@toc@l
+; LE-NEXT: xxland 2, 33, 1
+; LE-NEXT: vaddubm 3, 3, 3
+; LE-NEXT: vaddubm 2, 2, 2
+; LE-NEXT: xxlor 2, 2, 35
+; LE-NEXT: xxland 35, 2, 37
+; LE-NEXT: xxswapd 33, 3
+; LE-NEXT: xxland 3, 39, 1
+; LE-NEXT: xxlor 34, 3, 34
+; LE-NEXT: lxvd2x 3, 0, 3
+; LE-NEXT: vmuloub 7, 2, 3
+; LE-NEXT: vmuleub 3, 2, 3
+; LE-NEXT: vperm 3, 3, 7, 1
+; LE-NEXT: xxland 39, 2, 32
+; LE-NEXT: vmuloub 8, 2, 7
+; LE-NEXT: vmuleub 7, 2, 7
+; LE-NEXT: vperm 7, 7, 8, 1
+; LE-NEXT: xxland 40, 2, 36
+; LE-NEXT: vmuloub 9, 2, 8
+; LE-NEXT: vmuleub 8, 2, 8
+; LE-NEXT: vperm 8, 8, 9, 1
+; LE-NEXT: xxland 41, 2, 38
+; LE-NEXT: vaddubm 6, 6, 6
+; LE-NEXT: vmuloub 10, 2, 9
+; LE-NEXT: vmuleub 9, 2, 9
+; LE-NEXT: xxland 38, 2, 38
+; LE-NEXT: vperm 9, 9, 10, 1
+; LE-NEXT: vmuloub 10, 2, 6
+; LE-NEXT: vmuleub 6, 2, 6
+; LE-NEXT: vperm 6, 6, 10, 1
+; LE-NEXT: xxland 42, 2, 3
+; LE-NEXT: vmuloub 11, 2, 10
+; LE-NEXT: vmuleub 10, 2, 10
+; LE-NEXT: vperm 10, 10, 11, 1
+; LE-NEXT: vslb 11, 4, 4
+; LE-NEXT: xxland 43, 2, 43
+; LE-NEXT: vmuloub 12, 2, 11
+; LE-NEXT: vmuleub 11, 2, 11
+; LE-NEXT: vperm 11, 11, 12, 1
+; LE-NEXT: xxleqv 44, 44, 44
+; LE-NEXT: vslb 12, 12, 12
+; LE-NEXT: xxland 44, 2, 44
+; LE-NEXT: xxlxor 2, 39, 35
+; LE-NEXT: xxlxor 2, 2, 40
+; LE-NEXT: vmuloub 13, 2, 12
+; LE-NEXT: vmuleub 2, 2, 12
+; LE-NEXT: xxlxor 2, 2, 41
+; LE-NEXT: xxlxor 2, 2, 38
+; LE-NEXT: xxlxor 2, 2, 42
+; LE-NEXT: xxlxor 2, 2, 43
+; LE-NEXT: vperm 2, 2, 13, 1
+; LE-NEXT: xxlxor 34, 2, 34
+; LE-NEXT: vslb 3, 2, 4
+; LE-NEXT: vsrb 2, 2, 4
+; LE-NEXT: xxlor 34, 34, 35
+; LE-NEXT: xxland 35, 34, 0
+; LE-NEXT: vsrb 2, 2, 5
+; LE-NEXT: vslb 3, 3, 5
+; LE-NEXT: xxland 0, 34, 0
+; LE-NEXT: xxlor 34, 0, 35
+; LE-NEXT: vsrb 3, 2, 0
+; LE-NEXT: xxland 34, 34, 1
+; LE-NEXT: xxland 0, 35, 1
+; LE-NEXT: vaddubm 2, 2, 2
+; LE-NEXT: xxlor 34, 0, 34
+; LE-NEXT: vsrb 2, 2, 0
+; LE-NEXT: blr
+ %a.ext = zext <16 x i8> %a to <16 x i16>
+ %b.ext = zext <16 x i8> %b to <16 x i16>
+ %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+ %res.ext = lshr <16 x i16> %clmul, splat (i16 8)
+ %res = trunc <16 x i16> %res.ext to <16 x i8>
+ ret <16 x i8> %res
+}
+
+; clmulh_v8i16: per-lane high half of a 16-bit carry-less multiply.
+; Both <8 x i16> operands are zero-extended to <8 x i32>, multiplied with
+; @llvm.clmul.v8i32, and the product is shifted right by 16 and truncated
+; back to <8 x i16>. The BE and LE check prefixes hold the expected assembly
+; for two separate llc configurations (RUN lines are not visible here;
+; presumably big- and little-endian PowerPC -- confirm against the file head).
+define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; BE-LABEL: clmulh_v8i16:
+; BE: # %bb.0:
+; BE-NEXT: li 3, -80
+; BE-NEXT: vspltish 4, 8
+; BE-NEXT: vxor 5, 5, 5
+; BE-NEXT: stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -64
+; BE-NEXT: vadduhm 19, 4, 4
+; BE-NEXT: vspltisb 1, -1
+; BE-NEXT: stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -48
+; BE-NEXT: vspltish 0, 2
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -32
+; BE-NEXT: vrlh 8, 2, 4
+; BE-NEXT: vspltish 2, 4
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, -16
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI9_0@toc@l
+; BE-NEXT: vrlh 6, 3, 4
+; BE-NEXT: vspltish 3, 1
+; BE-NEXT: vslh 13, 1, 1
+; BE-NEXT: vspltisb 1, 15
+; BE-NEXT: vand 14, 8, 1
+; BE-NEXT: vsrh 8, 8, 2
+; BE-NEXT: vand 15, 6, 1
+; BE-NEXT: vsrh 6, 6, 2
+; BE-NEXT: vslh 14, 14, 2
+; BE-NEXT: vand 8, 8, 1
+; BE-NEXT: vslh 15, 15, 2
+; BE-NEXT: vand 6, 6, 1
+; BE-NEXT: vor 8, 8, 14
+; BE-NEXT: vor 14, 6, 15
+; BE-NEXT: lvx 6, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI9_1@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI9_1@toc@l
+; BE-NEXT: vand 15, 8, 6
+; BE-NEXT: vsrh 8, 8, 0
+; BE-NEXT: vslh 15, 15, 0
+; BE-NEXT: vand 8, 8, 6
+; BE-NEXT: vor 15, 8, 15
+; BE-NEXT: lvx 8, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI9_2@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI9_2@toc@l
+; BE-NEXT: lvx 31, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI9_3@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI9_3@toc@l
+; BE-NEXT: lvx 30, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI9_4@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI9_4@toc@l
+; BE-NEXT: vand 16, 14, 6
+; BE-NEXT: lvx 29, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI9_5@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI9_5@toc@l
+; BE-NEXT: lvx 28, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI9_6@toc@ha
+; BE-NEXT: addi 3, 3, .LCPI9_6@toc@l
+; BE-NEXT: lvx 27, 0, 3
+; BE-NEXT: li 3, -16
+; BE-NEXT: vsrh 14, 14, 0
+; BE-NEXT: vslh 16, 16, 0
+; BE-NEXT: vand 14, 14, 6
+; BE-NEXT: vor 14, 14, 16
+; BE-NEXT: vsrh 17, 14, 3
+; BE-NEXT: vand 14, 14, 8
+; BE-NEXT: vadduhm 14, 14, 14
+; BE-NEXT: vsrh 16, 15, 3
+; BE-NEXT: vand 15, 15, 8
+; BE-NEXT: vadduhm 15, 15, 15
+; BE-NEXT: vand 17, 17, 8
+; BE-NEXT: vand 16, 16, 8
+; BE-NEXT: vor 14, 17, 14
+; BE-NEXT: vslh 7, 2, 2
+; BE-NEXT: vsldoi 9, 3, 3, 1
+; BE-NEXT: vsldoi 10, 0, 0, 1
+; BE-NEXT: vsldoi 11, 2, 2, 1
+; BE-NEXT: vslh 12, 4, 4
+; BE-NEXT: vor 15, 16, 15
+; BE-NEXT: vand 16, 14, 0
+; BE-NEXT: vand 17, 14, 3
+; BE-NEXT: vand 18, 14, 2
+; BE-NEXT: vand 19, 14, 19
+; BE-NEXT: vand 31, 14, 31
+; BE-NEXT: vand 7, 14, 7
+; BE-NEXT: vand 30, 14, 30
+; BE-NEXT: vand 9, 14, 9
+; BE-NEXT: vand 10, 14, 10
+; BE-NEXT: vand 11, 14, 11
+; BE-NEXT: vand 12, 14, 12
+; BE-NEXT: vand 29, 14, 29
+; BE-NEXT: vand 28, 14, 28
+; BE-NEXT: vand 27, 14, 27
+; BE-NEXT: vand 13, 14, 13
+; BE-NEXT: vand 14, 14, 4
+; BE-NEXT: vmladduhm 16, 15, 16, 5
+; BE-NEXT: vmladduhm 17, 15, 17, 5
+; BE-NEXT: vmladduhm 18, 15, 18, 5
+; BE-NEXT: vmladduhm 14, 15, 14, 5
+; BE-NEXT: vmladduhm 19, 15, 19, 5
+; BE-NEXT: vmladduhm 31, 15, 31, 5
+; BE-NEXT: vmladduhm 7, 15, 7, 5
+; BE-NEXT: vmladduhm 30, 15, 30, 5
+; BE-NEXT: vmladduhm 9, 15, 9, 5
+; BE-NEXT: vmladduhm 10, 15, 10, 5
+; BE-NEXT: vmladduhm 11, 15, 11, 5
+; BE-NEXT: vmladduhm 12, 15, 12, 5
+; BE-NEXT: vmladduhm 29, 15, 29, 5
+; BE-NEXT: vmladduhm 28, 15, 28, 5
+; BE-NEXT: vmladduhm 27, 15, 27, 5
+; BE-NEXT: vmladduhm 5, 15, 13, 5
+; BE-NEXT: vxor 13, 17, 16
+; BE-NEXT: vxor 13, 13, 18
+; BE-NEXT: vxor 13, 13, 14
+; BE-NEXT: vxor 13, 13, 19
+; BE-NEXT: vxor 13, 13, 31
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -32
+; BE-NEXT: vxor 7, 13, 7
+; BE-NEXT: vxor 7, 7, 30
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -48
+; BE-NEXT: vxor 7, 7, 9
+; BE-NEXT: vxor 7, 7, 10
+; BE-NEXT: vxor 7, 7, 11
+; BE-NEXT: vxor 7, 7, 12
+; BE-NEXT: vxor 7, 7, 29
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -64
+; BE-NEXT: vxor 7, 7, 28
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, -80
+; BE-NEXT: vxor 7, 7, 27
+; BE-NEXT: lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: vxor 5, 7, 5
+; BE-NEXT: vrlh 4, 5, 4
+; BE-NEXT: vand 5, 4, 1
+; BE-NEXT: vsrh 4, 4, 2
+; BE-NEXT: vslh 2, 5, 2
+; BE-NEXT: vand 4, 4, 1
+; BE-NEXT: vor 2, 4, 2
+; BE-NEXT: vand 4, 2, 6
+; BE-NEXT: vsrh 2, 2, 0
+; BE-NEXT: vslh 4, 4, 0
+; BE-NEXT: vand 2, 2, 6
+; BE-NEXT: vor 2, 2, 4
+; BE-NEXT: vsrh 4, 2, 3
+; BE-NEXT: vand 2, 2, 8
+; BE-NEXT: vadduhm 2, 2, 2
+; BE-NEXT: vand 4, 4, 8
+; BE-NEXT: vor 2, 4, 2
+; BE-NEXT: vsrh 2, 2, 3
+; BE-NEXT: blr
+;
+; LE-LABEL: clmulh_v8i16:
+; LE: # %bb.0:
+; LE-NEXT: vspltish 5, 8
+; LE-NEXT: vspltisb 4, 15
+; LE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
+; LE-NEXT: vrlh 2, 2, 5
+; LE-NEXT: vspltish 0, 4
+; LE-NEXT: addi 3, 3, .LCPI9_0@toc@l
+; LE-NEXT: vspltish 6, 2
+; LE-NEXT: vspltish 1, 1
+; LE-NEXT: vrlh 3, 3, 5
+; LE-NEXT: xxland 42, 34, 36
+; LE-NEXT: vsrh 2, 2, 0
+; LE-NEXT: vslh 10, 10, 0
+; LE-NEXT: xxland 0, 34, 36
+; LE-NEXT: vsldoi 7, 1, 1, 1
+; LE-NEXT: vsldoi 8, 6, 6, 1
+; LE-NEXT: vsldoi 9, 0, 0, 1
+; LE-NEXT: xxlor 34, 0, 42
+; LE-NEXT: lxvd2x 0, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI9_1@toc@ha
+; LE-NEXT: addi 3, 3, .LCPI9_1@toc@l
+; LE-NEXT: xxland 42, 34, 0
+; LE-NEXT: vsrh 2, 2, 6
+; LE-NEXT: vslh 10, 10, 6
+; LE-NEXT: xxland 1, 34, 0
+; LE-NEXT: xxlor 34, 1, 42
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI9_2@toc@ha
+; LE-NEXT: vsrh 10, 2, 1
+; LE-NEXT: addi 3, 3, .LCPI9_2@toc@l
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI9_3@toc@ha
+; LE-NEXT: xxland 34, 34, 1
+; LE-NEXT: xxland 2, 42, 1
+; LE-NEXT: xxland 42, 35, 36
+; LE-NEXT: vsrh 3, 3, 0
+; LE-NEXT: addi 3, 3, .LCPI9_3@toc@l
+; LE-NEXT: vadduhm 2, 2, 2
+; LE-NEXT: vslh 10, 10, 0
+; LE-NEXT: xxlor 34, 2, 34
+; LE-NEXT: xxland 2, 35, 36
+; LE-NEXT: xxlor 35, 2, 42
+; LE-NEXT: xxland 42, 35, 0
+; LE-NEXT: vsrh 3, 3, 6
+; LE-NEXT: vslh 10, 10, 6
+; LE-NEXT: xxland 2, 35, 0
+; LE-NEXT: xxlor 35, 2, 42
+; LE-NEXT: vsrh 10, 3, 1
+; LE-NEXT: xxland 35, 35, 1
+; LE-NEXT: xxland 2, 42, 1
+; LE-NEXT: vadduhm 3, 3, 3
+; LE-NEXT: xxlor 2, 2, 35
+; LE-NEXT: vxor 3, 3, 3
+; LE-NEXT: xxland 42, 2, 38
+; LE-NEXT: xxland 43, 2, 33
+; LE-NEXT: xxland 39, 2, 39
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: vmladduhm 11, 2, 11, 3
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 43, 42
+; LE-NEXT: xxland 42, 2, 32
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: xxland 42, 2, 37
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: vadduhm 10, 5, 5
+; LE-NEXT: xxland 42, 2, 42
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: xxland 42, 2, 4
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI9_4@toc@ha
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: addi 3, 3, .LCPI9_4@toc@l
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: vslh 10, 0, 0
+; LE-NEXT: xxland 42, 2, 42
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: xxland 42, 2, 4
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI9_5@toc@ha
+; LE-NEXT: vmladduhm 10, 2, 10, 3
+; LE-NEXT: addi 3, 3, .LCPI9_5@toc@l
+; LE-NEXT: xxlxor 3, 3, 42
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 40
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 41
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: vslh 7, 5, 5
+; LE-NEXT: xxland 39, 2, 39
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 4
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI9_6@toc@ha
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: addi 3, 3, .LCPI9_6@toc@l
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 4
+; LE-NEXT: lxvd2x 4, 0, 3
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxland 39, 2, 4
+; LE-NEXT: vmladduhm 7, 2, 7, 3
+; LE-NEXT: xxlxor 3, 3, 39
+; LE-NEXT: xxleqv 39, 39, 39
+; LE-NEXT: vslh 7, 7, 7
+; LE-NEXT: xxland 39, 2, 39
+; LE-NEXT: vmladduhm 2, 2, 7, 3
+; LE-NEXT: xxlxor 34, 3, 34
+; LE-NEXT: vrlh 2, 2, 5
+; LE-NEXT: xxland 35, 34, 36
+; LE-NEXT: vsrh 2, 2, 0
+; LE-NEXT: vslh 3, 3, 0
+; LE-NEXT: xxland 2, 34, 36
+; LE-NEXT: xxlor 34, 2, 35
+; LE-NEXT: xxland 35, 34, 0
+; LE-NEXT: vsrh 2, 2, 6
+; LE-NEXT: vslh 3, 3, 6
+; LE-NEXT: xxland 0, 34, 0
+; LE-NEXT: xxlor 34, 0, 35
+; LE-NEXT: vsrh 3, 2, 1
+; LE-NEXT: xxland 34, 34, 1
+; LE-NEXT: xxland 0, 35, 1
+; LE-NEXT: vadduhm 2, 2, 2
+; LE-NEXT: xxlor 34, 0, 34
+; LE-NEXT: vsrh 2, 2, 1
+; LE-NEXT: blr
+ %a.ext = zext <8 x i16> %a to <8 x i32>
+ %b.ext = zext <8 x i16> %b to <8 x i32>
+ %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext)
+ %res.ext = lshr <8 x i32> %clmul, splat (i32 16)
+ %res = trunc <8 x i32> %res.ext to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; BE-LABEL: clmulh_v4i32:
+; BE: # %bb.0:
+; BE-NEXT: stdu 1, -1472(1)
+; BE-NEXT: li 3, 1280
+; BE-NEXT: vspltisb 12, -1
+; BE-NEXT: stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1296
+; BE-NEXT: vslw 15, 12, 12
+; BE-NEXT: vspltisw 12, 12
+; BE-NEXT: stvx 21, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1312
+; BE-NEXT: vadduwm 17, 12, 12
+; BE-NEXT: vspltisw 18, 8
+; BE-NEXT: stvx 22, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1328
+; BE-NEXT: vsrw 6, 2, 18
+; BE-NEXT: vspltisw 19, 4
+; BE-NEXT: stvx 23, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1344
+; BE-NEXT: stvx 24, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1360
+; BE-NEXT: stvx 25, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1376
+; BE-NEXT: vsrw 9, 3, 18
+; BE-NEXT: stvx 26, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1392
+; BE-NEXT: stvx 27, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1408
+; BE-NEXT: stvx 28, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1424
+; BE-NEXT: vsrw 12, 2, 17
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1440
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1456
+; BE-NEXT: vspltisw 30, 2
+; BE-NEXT: vslw 14, 2, 17
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1264
+; BE-NEXT: vspltisw 31, 1
+; BE-NEXT: stvx 17, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI10_0 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_0 at toc@l
+; BE-NEXT: lvx 29, 0, 3
+; BE-NEXT: li 3, 1248
+; BE-NEXT: vsrw 16, 3, 17
+; BE-NEXT: stvx 29, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1232
+; BE-NEXT: vslw 17, 3, 17
+; BE-NEXT: vand 2, 2, 29
+; BE-NEXT: vand 3, 3, 29
+; BE-NEXT: vand 6, 6, 29
+; BE-NEXT: vand 9, 9, 29
+; BE-NEXT: vslw 2, 2, 18
+; BE-NEXT: vslw 3, 3, 18
+; BE-NEXT: vor 6, 6, 12
+; BE-NEXT: vspltisb 12, 15
+; BE-NEXT: stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI10_1 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_1 at toc@l
+; BE-NEXT: vor 9, 9, 16
+; BE-NEXT: vor 2, 14, 2
+; BE-NEXT: vor 3, 17, 3
+; BE-NEXT: vor 2, 2, 6
+; BE-NEXT: vor 3, 3, 9
+; BE-NEXT: vand 6, 2, 12
+; BE-NEXT: vsrw 2, 2, 19
+; BE-NEXT: vand 9, 3, 12
+; BE-NEXT: vsrw 3, 3, 19
+; BE-NEXT: vand 2, 2, 12
+; BE-NEXT: vand 3, 3, 12
+; BE-NEXT: lvx 12, 0, 3
+; BE-NEXT: li 3, 1216
+; BE-NEXT: stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI10_2 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_2 at toc@l
+; BE-NEXT: vslw 6, 6, 19
+; BE-NEXT: vslw 9, 9, 19
+; BE-NEXT: vor 2, 2, 6
+; BE-NEXT: vor 3, 3, 9
+; BE-NEXT: vand 6, 2, 12
+; BE-NEXT: vsrw 2, 2, 30
+; BE-NEXT: vand 9, 3, 12
+; BE-NEXT: vsrw 3, 3, 30
+; BE-NEXT: vand 2, 2, 12
+; BE-NEXT: vand 3, 3, 12
+; BE-NEXT: lvx 12, 0, 3
+; BE-NEXT: li 3, 1200
+; BE-NEXT: stvx 12, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1136
+; BE-NEXT: stvx 18, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI10_3 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_3 at toc@l
+; BE-NEXT: vslw 6, 6, 30
+; BE-NEXT: vslw 9, 9, 30
+; BE-NEXT: vor 2, 2, 6
+; BE-NEXT: vor 3, 3, 9
+; BE-NEXT: vsrw 6, 2, 31
+; BE-NEXT: vand 2, 2, 12
+; BE-NEXT: vadduwm 2, 2, 2
+; BE-NEXT: vsrw 9, 3, 31
+; BE-NEXT: vand 3, 3, 12
+; BE-NEXT: vand 6, 6, 12
+; BE-NEXT: vand 12, 9, 12
+; BE-NEXT: vor 9, 6, 2
+; BE-NEXT: vadduwm 2, 3, 3
+; BE-NEXT: vor 14, 12, 2
+; BE-NEXT: vadduwm 2, 18, 18
+; BE-NEXT: vand 28, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_4 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_4 at toc@l
+; BE-NEXT: vand 27, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_5 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_5 at toc@l
+; BE-NEXT: vand 25, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_6 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_6 at toc@l
+; BE-NEXT: vslw 4, 19, 19
+; BE-NEXT: vand 26, 14, 4
+; BE-NEXT: vand 4, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_7 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_7 at toc@l
+; BE-NEXT: vsldoi 5, 31, 31, 1
+; BE-NEXT: vand 24, 14, 5
+; BE-NEXT: vand 5, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_8 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_8 at toc@l
+; BE-NEXT: vand 29, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_9 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_9 at toc@l
+; BE-NEXT: vand 21, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_10 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_10 at toc@l
+; BE-NEXT: vslw 7, 18, 18
+; BE-NEXT: vand 3, 14, 7
+; BE-NEXT: vand 7, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_11 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_11 at toc@l
+; BE-NEXT: vsldoi 13, 18, 18, 2
+; BE-NEXT: vand 16, 14, 13
+; BE-NEXT: vand 13, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: addis 3, 2, .LCPI10_12 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_12 at toc@l
+; BE-NEXT: vand 12, 14, 2
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: li 3, 1184
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1168
+; BE-NEXT: stvx 30, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1152
+; BE-NEXT: vsldoi 11, 31, 31, 2
+; BE-NEXT: stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1072
+; BE-NEXT: vsldoi 1, 19, 19, 1
+; BE-NEXT: vsldoi 10, 30, 30, 2
+; BE-NEXT: vand 20, 14, 11
+; BE-NEXT: vand 11, 14, 2
+; BE-NEXT: vsldoi 2, 31, 31, 3
+; BE-NEXT: vsldoi 8, 19, 19, 2
+; BE-NEXT: vand 22, 14, 1
+; BE-NEXT: vand 1, 14, 10
+; BE-NEXT: vand 10, 14, 2
+; BE-NEXT: vsldoi 2, 30, 30, 3
+; BE-NEXT: vand 17, 14, 8
+; BE-NEXT: vand 8, 14, 2
+; BE-NEXT: vsldoi 2, 19, 19, 3
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vsldoi 2, 18, 18, 3
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI10_13 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_13 at toc@l
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI10_14 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_14 at toc@l
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: li 3, 288
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: addis 3, 2, .LCPI10_15 at toc@ha
+; BE-NEXT: addi 3, 3, .LCPI10_15 at toc@l
+; BE-NEXT: lvx 2, 0, 3
+; BE-NEXT: li 3, 192
+; BE-NEXT: vand 2, 14, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 272
+; BE-NEXT: vand 2, 14, 15
+; BE-NEXT: vspltisw 15, -16
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: vand 2, 14, 30
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: vand 31, 14, 31
+; BE-NEXT: stvx 31, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 912
+; BE-NEXT: vsldoi 0, 30, 30, 1
+; BE-NEXT: vand 19, 14, 19
+; BE-NEXT: stvx 19, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 880
+; BE-NEXT: vand 23, 14, 0
+; BE-NEXT: vand 14, 14, 18
+; BE-NEXT: stvx 14, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1120
+; BE-NEXT: vxor 6, 6, 6
+; BE-NEXT: vrlw 0, 2, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1104
+; BE-NEXT: vrlw 0, 31, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1088
+; BE-NEXT: vrlw 0, 19, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1056
+; BE-NEXT: vrlw 0, 14, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1024
+; BE-NEXT: vrlw 0, 28, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 992
+; BE-NEXT: vrlw 0, 27, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 960
+; BE-NEXT: vrlw 0, 26, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 928
+; BE-NEXT: vrlw 0, 25, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 896
+; BE-NEXT: vrlw 0, 24, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 864
+; BE-NEXT: vrlw 0, 23, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 832
+; BE-NEXT: vrlw 0, 22, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 800
+; BE-NEXT: vrlw 0, 3, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 768
+; BE-NEXT: vrlw 0, 4, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 736
+; BE-NEXT: vrlw 0, 5, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 704
+; BE-NEXT: vrlw 0, 29, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 672
+; BE-NEXT: vrlw 0, 21, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 640
+; BE-NEXT: vrlw 0, 20, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 592
+; BE-NEXT: vrlw 0, 1, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 560
+; BE-NEXT: vrlw 0, 17, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 528
+; BE-NEXT: vrlw 0, 16, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 496
+; BE-NEXT: vrlw 0, 7, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 464
+; BE-NEXT: vrlw 0, 13, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 432
+; BE-NEXT: vrlw 0, 12, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 400
+; BE-NEXT: vrlw 0, 11, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 368
+; BE-NEXT: vrlw 0, 10, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 336
+; BE-NEXT: vrlw 0, 8, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1072
+; BE-NEXT: vmr 14, 7
+; BE-NEXT: lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 304
+; BE-NEXT: vrlw 0, 7, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vmr 30, 1
+; BE-NEXT: lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 240
+; BE-NEXT: vrlw 0, 1, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vmr 19, 5
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 208
+; BE-NEXT: vrlw 0, 5, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 288
+; BE-NEXT: vmr 18, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 160
+; BE-NEXT: vrlw 0, 4, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 192
+; BE-NEXT: vmr 31, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 128
+; BE-NEXT: vrlw 0, 3, 15
+; BE-NEXT: vmsumuhm 2, 9, 0, 6
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 272
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 64
+; BE-NEXT: vrlw 0, 2, 15
+; BE-NEXT: vmsumuhm 0, 9, 0, 6
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 96
+; BE-NEXT: vmulouh 0, 9, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 80
+; BE-NEXT: vmulouh 0, 9, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 912
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 112
+; BE-NEXT: vmulouh 0, 9, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 880
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 144
+; BE-NEXT: vmulouh 0, 9, 0
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 176
+; BE-NEXT: vmulouh 0, 9, 28
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 224
+; BE-NEXT: vmulouh 0, 9, 27
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 256
+; BE-NEXT: vmulouh 0, 9, 26
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 320
+; BE-NEXT: vmulouh 0, 9, 25
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 352
+; BE-NEXT: vmulouh 0, 9, 24
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 384
+; BE-NEXT: vmulouh 0, 9, 23
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 416
+; BE-NEXT: vmulouh 0, 9, 22
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 448
+; BE-NEXT: vmulouh 0, 9, 31
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 480
+; BE-NEXT: vmulouh 0, 9, 18
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 512
+; BE-NEXT: vmulouh 0, 9, 19
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 544
+; BE-NEXT: vmulouh 0, 9, 29
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 576
+; BE-NEXT: vmulouh 0, 9, 21
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 608
+; BE-NEXT: vmulouh 0, 9, 20
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 624
+; BE-NEXT: vmulouh 0, 9, 30
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 656
+; BE-NEXT: vmulouh 0, 9, 17
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 688
+; BE-NEXT: vmulouh 0, 9, 16
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 720
+; BE-NEXT: vmulouh 0, 9, 14
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 752
+; BE-NEXT: vmulouh 0, 9, 13
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 784
+; BE-NEXT: vmulouh 0, 9, 12
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 816
+; BE-NEXT: vmulouh 0, 9, 11
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 848
+; BE-NEXT: vmulouh 0, 9, 10
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 880
+; BE-NEXT: vmulouh 0, 9, 8
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 912
+; BE-NEXT: vmulouh 0, 9, 7
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 944
+; BE-NEXT: vmulouh 0, 9, 1
+; BE-NEXT: stvx 0, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 976
+; BE-NEXT: vmulouh 5, 9, 5
+; BE-NEXT: stvx 5, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vmulouh 4, 9, 4
+; BE-NEXT: stvx 4, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vmulouh 3, 9, 3
+; BE-NEXT: stvx 3, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1072
+; BE-NEXT: vmulouh 2, 9, 2
+; BE-NEXT: stvx 2, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 1120
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1104
+; BE-NEXT: vslw 9, 2, 15
+; BE-NEXT: lvx 2, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1088
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1056
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1024
+; BE-NEXT: vslw 2, 2, 15
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 992
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 960
+; BE-NEXT: lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 928
+; BE-NEXT: vslw 3, 3, 15
+; BE-NEXT: lvx 6, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 896
+; BE-NEXT: lvx 7, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 864
+; BE-NEXT: lvx 8, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 832
+; BE-NEXT: vslw 4, 4, 15
+; BE-NEXT: lvx 10, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 800
+; BE-NEXT: lvx 11, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 768
+; BE-NEXT: vslw 5, 5, 15
+; BE-NEXT: lvx 12, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 736
+; BE-NEXT: lvx 13, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 704
+; BE-NEXT: lvx 14, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 672
+; BE-NEXT: vslw 0, 0, 15
+; BE-NEXT: lvx 16, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 640
+; BE-NEXT: lvx 17, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 592
+; BE-NEXT: lvx 18, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 560
+; BE-NEXT: vslw 1, 1, 15
+; BE-NEXT: lvx 19, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 528
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 496
+; BE-NEXT: vslw 6, 6, 15
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 464
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 432
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 400
+; BE-NEXT: vslw 7, 7, 15
+; BE-NEXT: lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 368
+; BE-NEXT: lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 336
+; BE-NEXT: lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 304
+; BE-NEXT: vslw 8, 8, 15
+; BE-NEXT: lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 240
+; BE-NEXT: lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 208
+; BE-NEXT: vslw 10, 10, 15
+; BE-NEXT: lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 160
+; BE-NEXT: lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 128
+; BE-NEXT: lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1120
+; BE-NEXT: vslw 11, 11, 15
+; BE-NEXT: vslw 20, 20, 15
+; BE-NEXT: stvx 20, 1, 3 # 16-byte Folded Spill
+; BE-NEXT: li 3, 64
+; BE-NEXT: lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 96
+; BE-NEXT: vslw 12, 12, 15
+; BE-NEXT: vslw 13, 13, 15
+; BE-NEXT: vslw 14, 14, 15
+; BE-NEXT: vslw 16, 16, 15
+; BE-NEXT: vslw 17, 17, 15
+; BE-NEXT: vslw 18, 18, 15
+; BE-NEXT: vslw 19, 19, 15
+; BE-NEXT: vslw 31, 31, 15
+; BE-NEXT: vslw 30, 30, 15
+; BE-NEXT: vslw 29, 29, 15
+; BE-NEXT: vslw 28, 28, 15
+; BE-NEXT: vslw 27, 27, 15
+; BE-NEXT: vslw 26, 26, 15
+; BE-NEXT: vslw 25, 25, 15
+; BE-NEXT: vslw 24, 24, 15
+; BE-NEXT: vslw 23, 23, 15
+; BE-NEXT: vslw 22, 22, 15
+; BE-NEXT: vslw 21, 21, 15
+; BE-NEXT: vslw 20, 20, 15
+; BE-NEXT: lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 80
+; BE-NEXT: vadduwm 9, 15, 9
+; BE-NEXT: lvx 15, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 112
+; BE-NEXT: vadduwm 2, 15, 2
+; BE-NEXT: vxor 2, 2, 9
+; BE-NEXT: lvx 9, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 144
+; BE-NEXT: vadduwm 3, 9, 3
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 176
+; BE-NEXT: vadduwm 3, 3, 4
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 224
+; BE-NEXT: vadduwm 3, 3, 5
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 256
+; BE-NEXT: vadduwm 3, 3, 0
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 320
+; BE-NEXT: vadduwm 3, 3, 1
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 352
+; BE-NEXT: vadduwm 3, 3, 6
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 384
+; BE-NEXT: vadduwm 3, 3, 7
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 416
+; BE-NEXT: vadduwm 3, 3, 8
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 448
+; BE-NEXT: vadduwm 3, 3, 10
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 480
+; BE-NEXT: vadduwm 3, 3, 11
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 512
+; BE-NEXT: vadduwm 3, 3, 12
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 544
+; BE-NEXT: vadduwm 3, 3, 13
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 576
+; BE-NEXT: vadduwm 3, 3, 14
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 608
+; BE-NEXT: vadduwm 3, 3, 16
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 624
+; BE-NEXT: vadduwm 3, 3, 17
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 656
+; BE-NEXT: vadduwm 3, 3, 18
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 688
+; BE-NEXT: vadduwm 3, 3, 19
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 720
+; BE-NEXT: vadduwm 3, 3, 31
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 752
+; BE-NEXT: vadduwm 3, 3, 30
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 784
+; BE-NEXT: vadduwm 3, 3, 29
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 816
+; BE-NEXT: vadduwm 3, 3, 28
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 848
+; BE-NEXT: vadduwm 3, 3, 27
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 880
+; BE-NEXT: vadduwm 3, 3, 26
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 912
+; BE-NEXT: vadduwm 3, 3, 25
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 944
+; BE-NEXT: vadduwm 3, 3, 24
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 976
+; BE-NEXT: vadduwm 3, 3, 23
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1008
+; BE-NEXT: vadduwm 3, 3, 22
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1040
+; BE-NEXT: vadduwm 3, 3, 21
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1120
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1072
+; BE-NEXT: vadduwm 3, 3, 4
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 3, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1264
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1136
+; BE-NEXT: vadduwm 3, 3, 20
+; BE-NEXT: lvx 1, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1248
+; BE-NEXT: vxor 2, 2, 3
+; BE-NEXT: lvx 0, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1232
+; BE-NEXT: vsrw 3, 2, 5
+; BE-NEXT: vsrw 4, 2, 1
+; BE-NEXT: vslw 5, 2, 5
+; BE-NEXT: vand 2, 2, 0
+; BE-NEXT: vslw 2, 2, 1
+; BE-NEXT: vand 4, 4, 0
+; BE-NEXT: vor 2, 5, 2
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1152
+; BE-NEXT: vor 3, 4, 3
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1216
+; BE-NEXT: vor 2, 2, 3
+; BE-NEXT: vand 3, 2, 5
+; BE-NEXT: vsrw 2, 2, 4
+; BE-NEXT: vand 2, 2, 5
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1168
+; BE-NEXT: vslw 3, 3, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1184
+; BE-NEXT: vor 2, 2, 3
+; BE-NEXT: vand 3, 2, 5
+; BE-NEXT: vsrw 2, 2, 4
+; BE-NEXT: vslw 3, 3, 4
+; BE-NEXT: lvx 4, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1200
+; BE-NEXT: vand 2, 2, 5
+; BE-NEXT: lvx 5, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1456
+; BE-NEXT: lvx 31, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1440
+; BE-NEXT: lvx 30, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1424
+; BE-NEXT: vor 2, 2, 3
+; BE-NEXT: lvx 29, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1408
+; BE-NEXT: lvx 28, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1392
+; BE-NEXT: lvx 27, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1376
+; BE-NEXT: vsrw 3, 2, 4
+; BE-NEXT: lvx 26, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1360
+; BE-NEXT: lvx 25, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1344
+; BE-NEXT: lvx 24, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1328
+; BE-NEXT: vand 2, 2, 5
+; BE-NEXT: vadduwm 2, 2, 2
+; BE-NEXT: lvx 23, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1312
+; BE-NEXT: lvx 22, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1296
+; BE-NEXT: vand 3, 3, 5
+; BE-NEXT: lvx 21, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: li 3, 1280
+; BE-NEXT: lvx 20, 1, 3 # 16-byte Folded Reload
+; BE-NEXT: vor 2, 3, 2
+; BE-NEXT: vsrw 2, 2, 4
+; BE-NEXT: addi 1, 1, 1472
+; BE-NEXT: blr
+;
+; LE-LABEL: clmulh_v4i32:
+; LE: # %bb.0:
+; LE-NEXT: addis 3, 2, .LCPI10_0 at toc@ha
+; LE-NEXT: vspltisw 7, 12
+; LE-NEXT: vspltisw 4, 8
+; LE-NEXT: addi 3, 3, .LCPI10_0 at toc@l
+; LE-NEXT: vadduwm 7, 7, 7
+; LE-NEXT: vsrw 18, 2, 4
+; LE-NEXT: vspltisb 5, 15
+; LE-NEXT: vspltisw 0, 4
+; LE-NEXT: lxvd2x 0, 0, 3
+; LE-NEXT: vsrw 17, 2, 7
+; LE-NEXT: addis 3, 2, .LCPI10_1 at toc@ha
+; LE-NEXT: vspltisw 6, 2
+; LE-NEXT: vspltisw 1, 1
+; LE-NEXT: vsldoi 10, 0, 0, 1
+; LE-NEXT: addi 3, 3, .LCPI10_1 at toc@l
+; LE-NEXT: vsldoi 13, 0, 0, 2
+; LE-NEXT: vsldoi 9, 6, 6, 1
+; LE-NEXT: vsldoi 12, 6, 6, 2
+; LE-NEXT: vsldoi 14, 4, 4, 2
+; LE-NEXT: vsldoi 16, 6, 6, 3
+; LE-NEXT: vsldoi 8, 1, 1, 1
+; LE-NEXT: vsldoi 11, 1, 1, 2
+; LE-NEXT: vsldoi 15, 1, 1, 3
+; LE-NEXT: xxland 1, 50, 0
+; LE-NEXT: xxlor 1, 1, 49
+; LE-NEXT: vslw 17, 2, 7
+; LE-NEXT: xxland 34, 34, 0
+; LE-NEXT: vslw 2, 2, 4
+; LE-NEXT: xxlor 2, 49, 34
+; LE-NEXT: xxlor 34, 2, 1
+; LE-NEXT: xxland 50, 34, 37
+; LE-NEXT: vsrw 2, 2, 0
+; LE-NEXT: vslw 18, 18, 0
+; LE-NEXT: xxland 1, 34, 37
+; LE-NEXT: xxlor 34, 1, 50
+; LE-NEXT: lxvd2x 1, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_2 at toc@ha
+; LE-NEXT: addi 3, 3, .LCPI10_2 at toc@l
+; LE-NEXT: xxland 51, 34, 1
+; LE-NEXT: vsrw 2, 2, 6
+; LE-NEXT: vslw 19, 19, 6
+; LE-NEXT: xxland 2, 34, 1
+; LE-NEXT: xxlor 34, 2, 51
+; LE-NEXT: lxvd2x 2, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_3 at toc@ha
+; LE-NEXT: vsrw 19, 2, 1
+; LE-NEXT: addi 3, 3, .LCPI10_3 at toc@l
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_4 at toc@ha
+; LE-NEXT: xxland 34, 34, 2
+; LE-NEXT: xxland 3, 51, 2
+; LE-NEXT: vsrw 19, 3, 4
+; LE-NEXT: addi 3, 3, .LCPI10_4 at toc@l
+; LE-NEXT: vadduwm 2, 2, 2
+; LE-NEXT: xxlor 34, 3, 34
+; LE-NEXT: xxland 3, 51, 0
+; LE-NEXT: vsrw 19, 3, 7
+; LE-NEXT: xxlor 3, 3, 51
+; LE-NEXT: vslw 19, 3, 7
+; LE-NEXT: xxland 35, 35, 0
+; LE-NEXT: vsldoi 17, 0, 0, 3
+; LE-NEXT: vslw 3, 3, 4
+; LE-NEXT: xxlor 4, 51, 35
+; LE-NEXT: xxlor 35, 4, 3
+; LE-NEXT: xxland 51, 35, 37
+; LE-NEXT: vsrw 3, 3, 0
+; LE-NEXT: vslw 19, 19, 0
+; LE-NEXT: xxland 3, 35, 37
+; LE-NEXT: xxlor 35, 3, 51
+; LE-NEXT: xxland 51, 35, 1
+; LE-NEXT: vsrw 3, 3, 6
+; LE-NEXT: vslw 19, 19, 6
+; LE-NEXT: xxland 3, 35, 1
+; LE-NEXT: xxlor 35, 3, 51
+; LE-NEXT: vsrw 19, 3, 1
+; LE-NEXT: xxland 35, 35, 2
+; LE-NEXT: xxland 3, 51, 2
+; LE-NEXT: vadduwm 3, 3, 3
+; LE-NEXT: xxlor 3, 3, 35
+; LE-NEXT: xxland 35, 3, 38
+; LE-NEXT: xxland 51, 3, 33
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: vmuluwm 19, 2, 19
+; LE-NEXT: xxlxor 4, 51, 35
+; LE-NEXT: xxland 35, 3, 32
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 36
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: vadduwm 3, 4, 4
+; LE-NEXT: xxland 35, 3, 35
+; LE-NEXT: vsldoi 18, 4, 4, 3
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_5 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_5 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: vslw 3, 0, 0
+; LE-NEXT: xxland 35, 3, 35
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_6 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_6 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 40
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 41
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 42
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: vslw 3, 4, 4
+; LE-NEXT: xxland 35, 3, 35
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_7 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_7 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_8 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_8 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_9 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_9 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_10 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_10 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 43
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 44
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 45
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 46
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_11 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_11 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_12 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_12 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_13 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_13 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_14 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_14 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 47
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 48
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 49
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 50
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: addis 3, 2, .LCPI10_15 at toc@ha
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: addi 3, 3, .LCPI10_15 at toc@l
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: lxvd2x 5, 0, 3
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxland 35, 3, 5
+; LE-NEXT: vmuluwm 3, 2, 3
+; LE-NEXT: xxlxor 4, 4, 35
+; LE-NEXT: xxleqv 35, 35, 35
+; LE-NEXT: vslw 3, 3, 3
+; LE-NEXT: xxland 35, 3, 35
+; LE-NEXT: vmuluwm 2, 2, 3
+; LE-NEXT: xxlxor 34, 4, 34
+; LE-NEXT: vsrw 8, 2, 4
+; LE-NEXT: vsrw 3, 2, 7
+; LE-NEXT: xxland 3, 40, 0
+; LE-NEXT: xxlor 3, 3, 35
+; LE-NEXT: vslw 3, 2, 7
+; LE-NEXT: xxland 34, 34, 0
+; LE-NEXT: vslw 2, 2, 4
+; LE-NEXT: xxlor 0, 35, 34
+; LE-NEXT: xxlor 34, 0, 3
+; LE-NEXT: xxland 35, 34, 37
+; LE-NEXT: vsrw 2, 2, 0
+; LE-NEXT: vslw 3, 3, 0
+; LE-NEXT: xxland 0, 34, 37
+; LE-NEXT: xxlor 34, 0, 35
+; LE-NEXT: xxland 35, 34, 1
+; LE-NEXT: vsrw 2, 2, 6
+; LE-NEXT: vslw 3, 3, 6
+; LE-NEXT: xxland 0, 34, 1
+; LE-NEXT: xxlor 34, 0, 35
+; LE-NEXT: vsrw 3, 2, 1
+; LE-NEXT: xxland 34, 34, 2
+; LE-NEXT: xxland 0, 35, 2
+; LE-NEXT: vadduwm 2, 2, 2
+; LE-NEXT: xxlor 34, 0, 34
+; LE-NEXT: vsrw 2, 2, 1
+; LE-NEXT: blr
+ %a.ext = zext <4 x i32> %a to <4 x i64>
+ %b.ext = zext <4 x i32> %b to <4 x i64>
+ %clmul = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %a.ext, <4 x i64> %b.ext)
+ %res.ext = lshr <4 x i64> %clmul, splat (i64 32)
+ %res = trunc <4 x i64> %res.ext to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; BE-LABEL: clmulh_v2i64:
+; BE: # %bb.0:
+; BE-NEXT: stdu 1, -1040(1)
+; BE-NEXT: lis 7, -21846
+; BE-NEXT: lis 8, 21845
+; BE-NEXT: std 26, 992(1) # 8-byte Folded Spill
+; BE-NEXT: ori 7, 7, 43690
+; BE-NEXT: ori 8, 8, 21845
+; BE-NEXT: std 27, 1000(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 7, 7, 32
+; BE-NEXT: sldi 8, 8, 32
+; BE-NEXT: lis 9, -13108
+; BE-NEXT: lis 10, 13107
+; BE-NEXT: std 30, 1024(1) # 8-byte Folded Spill
+; BE-NEXT: oris 7, 7, 43690
+; BE-NEXT: oris 8, 8, 21845
+; BE-NEXT: std 28, 1008(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 0, 3, 1
+; BE-NEXT: rldicl 3, 3, 63, 1
+; BE-NEXT: ori 9, 9, 52428
+; BE-NEXT: ori 10, 10, 13107
+; BE-NEXT: std 29, 1016(1) # 8-byte Folded Spill
+; BE-NEXT: ori 27, 7, 43690
+; BE-NEXT: ori 26, 8, 21845
+; BE-NEXT: std 2, 888(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 9, 9, 32
+; BE-NEXT: sldi 10, 10, 32
+; BE-NEXT: and 7, 0, 27
+; BE-NEXT: and 3, 3, 26
+; BE-NEXT: std 31, 1032(1) # 8-byte Folded Spill
+; BE-NEXT: lis 11, -3856
+; BE-NEXT: lis 12, 3855
+; BE-NEXT: std 15, 904(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 30, 5, 1
+; BE-NEXT: rldicl 5, 5, 63, 1
+; BE-NEXT: oris 9, 9, 52428
+; BE-NEXT: oris 10, 10, 13107
+; BE-NEXT: std 14, 896(1) # 8-byte Folded Spill
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: ori 11, 11, 61680
+; BE-NEXT: std 17, 920(1) # 8-byte Folded Spill
+; BE-NEXT: ori 12, 12, 3855
+; BE-NEXT: ori 29, 9, 52428
+; BE-NEXT: ori 28, 10, 13107
+; BE-NEXT: and 8, 30, 27
+; BE-NEXT: std 16, 912(1) # 8-byte Folded Spill
+; BE-NEXT: and 5, 5, 26
+; BE-NEXT: sldi 7, 3, 2
+; BE-NEXT: std 19, 936(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 3, 62, 2
+; BE-NEXT: sldi 11, 11, 32
+; BE-NEXT: sldi 12, 12, 32
+; BE-NEXT: or 5, 5, 8
+; BE-NEXT: std 18, 928(1) # 8-byte Folded Spill
+; BE-NEXT: and 7, 7, 29
+; BE-NEXT: and 3, 3, 28
+; BE-NEXT: std 21, 952(1) # 8-byte Folded Spill
+; BE-NEXT: oris 11, 11, 61680
+; BE-NEXT: oris 12, 12, 3855
+; BE-NEXT: sldi 8, 5, 2
+; BE-NEXT: rldicl 5, 5, 62, 2
+; BE-NEXT: std 20, 944(1) # 8-byte Folded Spill
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: ori 9, 11, 61680
+; BE-NEXT: std 23, 968(1) # 8-byte Folded Spill
+; BE-NEXT: ori 10, 12, 3855
+; BE-NEXT: and 8, 8, 29
+; BE-NEXT: and 5, 5, 28
+; BE-NEXT: sldi 7, 3, 4
+; BE-NEXT: std 22, 960(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 3, 60, 4
+; BE-NEXT: or 5, 5, 8
+; BE-NEXT: std 25, 984(1) # 8-byte Folded Spill
+; BE-NEXT: and 7, 7, 9
+; BE-NEXT: and 3, 3, 10
+; BE-NEXT: sldi 8, 5, 4
+; BE-NEXT: rldicl 5, 5, 60, 4
+; BE-NEXT: std 24, 976(1) # 8-byte Folded Spill
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: and 8, 8, 9
+; BE-NEXT: std 27, 352(1) # 8-byte Folded Spill
+; BE-NEXT: and 5, 5, 10
+; BE-NEXT: rotlwi 7, 3, 24
+; BE-NEXT: or 5, 5, 8
+; BE-NEXT: rlwimi 7, 3, 8, 8, 15
+; BE-NEXT: std 26, 344(1) # 8-byte Folded Spill
+; BE-NEXT: mr 30, 9
+; BE-NEXT: std 29, 368(1) # 8-byte Folded Spill
+; BE-NEXT: rotlwi 8, 5, 24
+; BE-NEXT: rldicl 9, 3, 32, 32
+; BE-NEXT: rlwimi 7, 3, 8, 24, 31
+; BE-NEXT: rldicl 3, 5, 32, 32
+; BE-NEXT: std 28, 360(1) # 8-byte Folded Spill
+; BE-NEXT: rlwimi 8, 5, 8, 8, 15
+; BE-NEXT: std 30, 376(1) # 8-byte Folded Spill
+; BE-NEXT: rotlwi 11, 3, 24
+; BE-NEXT: mr 0, 10
+; BE-NEXT: rotlwi 10, 9, 24
+; BE-NEXT: std 0, 384(1) # 8-byte Folded Spill
+; BE-NEXT: rlwimi 11, 3, 8, 8, 15
+; BE-NEXT: rlwimi 8, 5, 8, 24, 31
+; BE-NEXT: rlwimi 10, 9, 8, 8, 15
+; BE-NEXT: rlwimi 11, 3, 8, 24, 31
+; BE-NEXT: sldi 5, 8, 32
+; BE-NEXT: rlwimi 10, 9, 8, 24, 31
+; BE-NEXT: sldi 3, 7, 32
+; BE-NEXT: or 11, 5, 11
+; BE-NEXT: or 12, 3, 10
+; BE-NEXT: rlwinm 3, 11, 0, 30, 30
+; BE-NEXT: rlwinm 5, 11, 0, 29, 29
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 880(1) # 8-byte Folded Spill
+; BE-NEXT: clrldi 3, 11, 63
+; BE-NEXT: mulld 2, 12, 3
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 872(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 28, 28
+; BE-NEXT: rlwinm 5, 11, 0, 27, 27
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 856(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 864(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 26, 26
+; BE-NEXT: rlwinm 5, 11, 0, 25, 25
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 840(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 848(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 24, 24
+; BE-NEXT: rlwinm 5, 11, 0, 23, 23
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 824(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 832(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 22, 22
+; BE-NEXT: rlwinm 5, 11, 0, 21, 21
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 808(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 816(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 20, 20
+; BE-NEXT: rlwinm 5, 11, 0, 19, 19
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 792(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 800(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 18, 18
+; BE-NEXT: rlwinm 5, 11, 0, 17, 17
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 776(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 784(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 16, 16
+; BE-NEXT: rlwinm 5, 11, 0, 15, 15
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 760(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 768(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 14, 14
+; BE-NEXT: rlwinm 5, 11, 0, 13, 13
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 744(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 752(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 12, 12
+; BE-NEXT: rlwinm 5, 11, 0, 11, 11
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 728(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 736(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 10, 10
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 720(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 9, 9
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 712(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 8, 8
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 704(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 7, 7
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 696(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 6, 6
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 688(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 5, 5
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 680(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 4, 4
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 672(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 3, 3
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 664(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 2, 2
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 656(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 1, 1
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 648(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 3, 11, 0, 0, 0
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 640(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 32, 32
+; BE-NEXT: rldicl 3, 3, 32, 31
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 632(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 31, 33
+; BE-NEXT: rldicl 3, 3, 33, 30
+; BE-NEXT: rldicl 5, 11, 30, 34
+; BE-NEXT: rldicl 5, 5, 34, 29
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 616(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 624(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 29, 35
+; BE-NEXT: rldicl 3, 3, 35, 28
+; BE-NEXT: rldicl 5, 11, 28, 36
+; BE-NEXT: rldicl 5, 5, 36, 27
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 600(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 608(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 27, 37
+; BE-NEXT: rldicl 3, 3, 37, 26
+; BE-NEXT: rldicl 5, 11, 26, 38
+; BE-NEXT: rldicl 5, 5, 38, 25
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 584(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 592(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 25, 39
+; BE-NEXT: rldicl 3, 3, 39, 24
+; BE-NEXT: rldicl 5, 11, 24, 40
+; BE-NEXT: rldicl 5, 5, 40, 23
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 568(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 576(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 23, 41
+; BE-NEXT: rldicl 3, 3, 41, 22
+; BE-NEXT: rldicl 5, 11, 22, 42
+; BE-NEXT: rldicl 5, 5, 42, 21
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 552(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 560(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 21, 43
+; BE-NEXT: rldicl 3, 3, 43, 20
+; BE-NEXT: rldicl 5, 11, 20, 44
+; BE-NEXT: rldicl 5, 5, 44, 19
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 536(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 544(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 19, 45
+; BE-NEXT: rldicl 3, 3, 45, 18
+; BE-NEXT: rldicl 5, 11, 18, 46
+; BE-NEXT: rldicl 5, 5, 46, 17
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 520(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 528(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 17, 47
+; BE-NEXT: rldicl 3, 3, 47, 16
+; BE-NEXT: rldicl 5, 11, 16, 48
+; BE-NEXT: rldicl 5, 5, 48, 15
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 504(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 512(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 15, 49
+; BE-NEXT: rldicl 3, 3, 49, 14
+; BE-NEXT: rldicl 5, 11, 14, 50
+; BE-NEXT: rldicl 5, 5, 50, 13
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 488(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 496(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 13, 51
+; BE-NEXT: rldicl 3, 3, 51, 12
+; BE-NEXT: rldicl 5, 11, 12, 52
+; BE-NEXT: rldicl 5, 5, 52, 11
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 472(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 480(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 11, 53
+; BE-NEXT: rldicl 3, 3, 53, 10
+; BE-NEXT: rldicl 5, 11, 10, 54
+; BE-NEXT: rldicl 5, 5, 54, 9
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 456(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 464(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 9, 55
+; BE-NEXT: rldicl 3, 3, 55, 8
+; BE-NEXT: rldicl 5, 11, 8, 56
+; BE-NEXT: rldicl 5, 5, 56, 7
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 440(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 448(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 7, 57
+; BE-NEXT: rldicl 3, 3, 57, 6
+; BE-NEXT: rldicl 5, 11, 6, 58
+; BE-NEXT: rldicl 5, 5, 58, 5
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 424(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 432(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 5, 59
+; BE-NEXT: rldicl 3, 3, 59, 4
+; BE-NEXT: rldicl 5, 11, 4, 60
+; BE-NEXT: rldicl 5, 5, 60, 3
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 408(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 416(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 3, 11, 3, 61
+; BE-NEXT: rldicl 5, 11, 2, 62
+; BE-NEXT: rldicl 3, 3, 61, 2
+; BE-NEXT: rldicl 5, 5, 62, 1
+; BE-NEXT: mulld 3, 12, 3
+; BE-NEXT: std 3, 392(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 3, 12, 5
+; BE-NEXT: std 3, 400(1) # 8-byte Folded Spill
+; BE-NEXT: sldi 3, 4, 1
+; BE-NEXT: rldicl 4, 4, 63, 1
+; BE-NEXT: and 3, 3, 27
+; BE-NEXT: and 4, 4, 26
+; BE-NEXT: or 3, 4, 3
+; BE-NEXT: sldi 4, 3, 2
+; BE-NEXT: rldicl 3, 3, 62, 2
+; BE-NEXT: and 4, 4, 29
+; BE-NEXT: and 3, 3, 28
+; BE-NEXT: or 3, 3, 4
+; BE-NEXT: sldi 4, 3, 4
+; BE-NEXT: rldicl 3, 3, 60, 4
+; BE-NEXT: and 4, 4, 30
+; BE-NEXT: and 3, 3, 0
+; BE-NEXT: or 3, 3, 4
+; BE-NEXT: rotlwi 4, 3, 24
+; BE-NEXT: rlwimi 4, 3, 8, 8, 15
+; BE-NEXT: rlwimi 4, 3, 8, 24, 31
+; BE-NEXT: rldicl 3, 3, 32, 32
+; BE-NEXT: rotlwi 5, 3, 24
+; BE-NEXT: rlwimi 5, 3, 8, 8, 15
+; BE-NEXT: rlwimi 5, 3, 8, 24, 31
+; BE-NEXT: sldi 3, 6, 1
+; BE-NEXT: rldicl 6, 6, 63, 1
+; BE-NEXT: and 3, 3, 27
+; BE-NEXT: and 6, 6, 26
+; BE-NEXT: or 3, 6, 3
+; BE-NEXT: sldi 6, 3, 2
+; BE-NEXT: rldicl 3, 3, 62, 2
+; BE-NEXT: and 6, 6, 29
+; BE-NEXT: and 3, 3, 28
+; BE-NEXT: or 3, 3, 6
+; BE-NEXT: sldi 6, 3, 4
+; BE-NEXT: rldicl 3, 3, 60, 4
+; BE-NEXT: and 6, 6, 30
+; BE-NEXT: and 3, 3, 0
+; BE-NEXT: or 3, 3, 6
+; BE-NEXT: rotlwi 6, 3, 24
+; BE-NEXT: rlwimi 6, 3, 8, 8, 15
+; BE-NEXT: rlwimi 6, 3, 8, 24, 31
+; BE-NEXT: rldicl 3, 3, 32, 32
+; BE-NEXT: rotlwi 7, 3, 24
+; BE-NEXT: rlwimi 7, 3, 8, 8, 15
+; BE-NEXT: rlwimi 7, 3, 8, 24, 31
+; BE-NEXT: sldi 3, 4, 32
+; BE-NEXT: or 4, 3, 5
+; BE-NEXT: sldi 3, 6, 32
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: rlwinm 5, 3, 0, 30, 30
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 336(1) # 8-byte Folded Spill
+; BE-NEXT: clrldi 5, 3, 63
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 328(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 29, 29
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 320(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 28, 28
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 312(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 27, 27
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 304(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 26, 26
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 296(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 25, 25
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 288(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 24, 24
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 280(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 23, 23
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 272(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 22, 22
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 264(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 21, 21
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 256(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 20, 20
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 248(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 19, 19
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 240(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 18, 18
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 232(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 17, 17
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 224(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 16, 16
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 216(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 15, 15
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 208(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 14, 14
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 200(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 13, 13
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 192(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 12, 12
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 184(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 11, 11
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 176(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 10, 10
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 168(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 9, 9
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 160(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 8, 8
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 152(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 7, 7
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 144(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 6, 6
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 136(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 5, 5
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 128(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 4, 4
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 120(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 3, 3
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 112(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 2, 2
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 104(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 1, 1
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 96(1) # 8-byte Folded Spill
+; BE-NEXT: rlwinm 5, 3, 0, 0, 0
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 88(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 5, 3, 32, 32
+; BE-NEXT: rldicl 5, 5, 32, 31
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 80(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 5, 3, 31, 33
+; BE-NEXT: rldicl 5, 5, 33, 30
+; BE-NEXT: rldicl 6, 3, 30, 34
+; BE-NEXT: rldicl 6, 6, 34, 29
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: std 5, 64(1) # 8-byte Folded Spill
+; BE-NEXT: mulld 5, 4, 6
+; BE-NEXT: std 5, 72(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 5, 3, 29, 35
+; BE-NEXT: rldicl 6, 3, 28, 36
+; BE-NEXT: rldicl 5, 5, 35, 28
+; BE-NEXT: rldicl 6, 6, 36, 27
+; BE-NEXT: mulld 31, 4, 5
+; BE-NEXT: mulld 5, 4, 6
+; BE-NEXT: std 5, 56(1) # 8-byte Folded Spill
+; BE-NEXT: rldicl 5, 3, 27, 37
+; BE-NEXT: rldicl 5, 5, 37, 26
+; BE-NEXT: rldicl 6, 3, 26, 38
+; BE-NEXT: mulld 15, 4, 5
+; BE-NEXT: rldicl 5, 3, 25, 39
+; BE-NEXT: rldicl 6, 6, 38, 25
+; BE-NEXT: rldicl 5, 5, 39, 24
+; BE-NEXT: mulld 14, 4, 6
+; BE-NEXT: rldicl 6, 3, 24, 40
+; BE-NEXT: mulld 17, 4, 5
+; BE-NEXT: rldicl 5, 3, 23, 41
+; BE-NEXT: rldicl 6, 6, 40, 23
+; BE-NEXT: rldicl 5, 5, 41, 22
+; BE-NEXT: mulld 16, 4, 6
+; BE-NEXT: rldicl 6, 3, 22, 42
+; BE-NEXT: mulld 19, 4, 5
+; BE-NEXT: rldicl 5, 3, 21, 43
+; BE-NEXT: rldicl 6, 6, 42, 21
+; BE-NEXT: rldicl 5, 5, 43, 20
+; BE-NEXT: mulld 18, 4, 6
+; BE-NEXT: rldicl 6, 3, 20, 44
+; BE-NEXT: mulld 21, 4, 5
+; BE-NEXT: rldicl 5, 3, 19, 45
+; BE-NEXT: rldicl 6, 6, 44, 19
+; BE-NEXT: rldicl 5, 5, 45, 18
+; BE-NEXT: mulld 20, 4, 6
+; BE-NEXT: rldicl 6, 3, 18, 46
+; BE-NEXT: mulld 23, 4, 5
+; BE-NEXT: rldicl 5, 3, 17, 47
+; BE-NEXT: rldicl 6, 6, 46, 17
+; BE-NEXT: rldicl 5, 5, 47, 16
+; BE-NEXT: mulld 22, 4, 6
+; BE-NEXT: rldicl 6, 3, 16, 48
+; BE-NEXT: mulld 25, 4, 5
+; BE-NEXT: rldicl 5, 3, 15, 49
+; BE-NEXT: rldicl 6, 6, 48, 15
+; BE-NEXT: rldicl 5, 5, 49, 14
+; BE-NEXT: mulld 24, 4, 6
+; BE-NEXT: rldicl 6, 3, 14, 50
+; BE-NEXT: mulld 27, 4, 5
+; BE-NEXT: rldicl 5, 3, 13, 51
+; BE-NEXT: rldicl 6, 6, 50, 13
+; BE-NEXT: rldicl 5, 5, 51, 12
+; BE-NEXT: mulld 26, 4, 6
+; BE-NEXT: rldicl 6, 3, 12, 52
+; BE-NEXT: mulld 29, 4, 5
+; BE-NEXT: rldicl 5, 3, 11, 53
+; BE-NEXT: rldicl 6, 6, 52, 11
+; BE-NEXT: rldicl 5, 5, 53, 10
+; BE-NEXT: mulld 28, 4, 6
+; BE-NEXT: rldicl 6, 3, 10, 54
+; BE-NEXT: mulld 0, 4, 5
+; BE-NEXT: rldicl 5, 3, 9, 55
+; BE-NEXT: rldicl 6, 6, 54, 9
+; BE-NEXT: rldicl 5, 5, 55, 8
+; BE-NEXT: mulld 30, 4, 6
+; BE-NEXT: rldicl 6, 3, 8, 56
+; BE-NEXT: mulld 11, 4, 5
+; BE-NEXT: rldicl 5, 3, 7, 57
+; BE-NEXT: rldicl 6, 6, 56, 7
+; BE-NEXT: rldicl 5, 5, 57, 6
+; BE-NEXT: mulld 12, 4, 6
+; BE-NEXT: rldicl 6, 3, 6, 58
+; BE-NEXT: mulld 9, 4, 5
+; BE-NEXT: rldicl 5, 3, 5, 59
+; BE-NEXT: rldicl 6, 6, 58, 5
+; BE-NEXT: rldicl 5, 5, 59, 4
+; BE-NEXT: mulld 10, 4, 6
+; BE-NEXT: rldicl 6, 3, 4, 60
+; BE-NEXT: mulld 7, 4, 5
+; BE-NEXT: rldicl 5, 3, 3, 61
+; BE-NEXT: rldicl 3, 3, 2, 62
+; BE-NEXT: rldicl 6, 6, 60, 3
+; BE-NEXT: rldicl 3, 3, 62, 1
+; BE-NEXT: mulld 8, 4, 6
+; BE-NEXT: mulld 6, 4, 3
+; BE-NEXT: ld 3, 880(1) # 8-byte Folded Reload
+; BE-NEXT: rldicl 5, 5, 61, 2
+; BE-NEXT: mulld 5, 4, 5
+; BE-NEXT: ld 4, 336(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 2, 3
+; BE-NEXT: ld 2, 328(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 2, 4
+; BE-NEXT: ld 2, 872(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 320(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 856(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 312(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 864(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 304(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 840(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 296(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 848(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 288(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 824(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 280(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 832(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 272(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 808(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 264(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 816(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 256(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 792(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 248(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 800(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 240(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 776(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 232(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 784(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 224(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 760(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 216(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 768(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 208(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 744(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 200(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 752(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 192(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 728(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 184(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 736(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 176(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 720(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 168(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 712(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 160(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 704(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 152(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 696(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 144(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 688(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 136(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 680(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 128(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 672(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 120(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 664(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 112(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 656(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 104(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 648(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 96(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 640(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 88(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 632(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 80(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 616(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 64(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 624(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: ld 2, 72(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 2
+; BE-NEXT: ld 2, 600(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 31
+; BE-NEXT: ld 31, 608(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 2
+; BE-NEXT: xor 3, 3, 31
+; BE-NEXT: ld 31, 56(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 31
+; BE-NEXT: ld 31, 584(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 15
+; BE-NEXT: xor 4, 4, 14
+; BE-NEXT: ld 15, 592(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 31
+; BE-NEXT: xor 4, 4, 17
+; BE-NEXT: xor 4, 4, 16
+; BE-NEXT: xor 3, 3, 15
+; BE-NEXT: ld 15, 568(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 19
+; BE-NEXT: xor 4, 4, 18
+; BE-NEXT: ld 17, 576(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 15
+; BE-NEXT: xor 4, 4, 21
+; BE-NEXT: xor 4, 4, 20
+; BE-NEXT: xor 3, 3, 17
+; BE-NEXT: ld 17, 552(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 23
+; BE-NEXT: xor 4, 4, 22
+; BE-NEXT: ld 19, 560(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 17
+; BE-NEXT: xor 4, 4, 25
+; BE-NEXT: xor 4, 4, 24
+; BE-NEXT: xor 3, 3, 19
+; BE-NEXT: ld 19, 536(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 27
+; BE-NEXT: xor 4, 4, 26
+; BE-NEXT: ld 21, 544(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 19
+; BE-NEXT: xor 4, 4, 29
+; BE-NEXT: xor 4, 4, 28
+; BE-NEXT: xor 3, 3, 21
+; BE-NEXT: ld 21, 520(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 0
+; BE-NEXT: xor 4, 4, 30
+; BE-NEXT: ld 23, 528(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 21
+; BE-NEXT: xor 4, 4, 11
+; BE-NEXT: xor 4, 4, 12
+; BE-NEXT: xor 3, 3, 23
+; BE-NEXT: ld 23, 504(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 9
+; BE-NEXT: xor 4, 4, 10
+; BE-NEXT: ld 25, 512(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 23
+; BE-NEXT: xor 4, 4, 7
+; BE-NEXT: xor 4, 4, 8
+; BE-NEXT: xor 3, 3, 25
+; BE-NEXT: ld 25, 488(1) # 8-byte Folded Reload
+; BE-NEXT: xor 4, 4, 5
+; BE-NEXT: xor 4, 4, 6
+; BE-NEXT: ld 27, 496(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 25
+; BE-NEXT: sldi 6, 4, 1
+; BE-NEXT: rldicl 4, 4, 63, 1
+; BE-NEXT: xor 3, 3, 27
+; BE-NEXT: ld 27, 472(1) # 8-byte Folded Reload
+; BE-NEXT: ld 29, 480(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 27
+; BE-NEXT: xor 3, 3, 29
+; BE-NEXT: ld 29, 456(1) # 8-byte Folded Reload
+; BE-NEXT: ld 0, 464(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 29
+; BE-NEXT: xor 3, 3, 0
+; BE-NEXT: ld 0, 440(1) # 8-byte Folded Reload
+; BE-NEXT: ld 11, 448(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 0
+; BE-NEXT: xor 3, 3, 11
+; BE-NEXT: ld 11, 424(1) # 8-byte Folded Reload
+; BE-NEXT: ld 9, 432(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 11
+; BE-NEXT: xor 3, 3, 9
+; BE-NEXT: ld 9, 408(1) # 8-byte Folded Reload
+; BE-NEXT: ld 7, 416(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 9
+; BE-NEXT: xor 3, 3, 7
+; BE-NEXT: ld 7, 392(1) # 8-byte Folded Reload
+; BE-NEXT: ld 5, 400(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 7
+; BE-NEXT: ld 7, 352(1) # 8-byte Folded Reload
+; BE-NEXT: xor 3, 3, 5
+; BE-NEXT: sldi 5, 3, 1
+; BE-NEXT: rldicl 3, 3, 63, 1
+; BE-NEXT: ld 8, 344(1) # 8-byte Folded Reload
+; BE-NEXT: and 5, 5, 7
+; BE-NEXT: and 6, 6, 7
+; BE-NEXT: and 3, 3, 8
+; BE-NEXT: and 4, 4, 8
+; BE-NEXT: ld 7, 368(1) # 8-byte Folded Reload
+; BE-NEXT: or 3, 3, 5
+; BE-NEXT: or 4, 4, 6
+; BE-NEXT: ld 8, 360(1) # 8-byte Folded Reload
+; BE-NEXT: sldi 5, 3, 2
+; BE-NEXT: rldicl 3, 3, 62, 2
+; BE-NEXT: sldi 6, 4, 2
+; BE-NEXT: rldicl 4, 4, 62, 2
+; BE-NEXT: ld 2, 888(1) # 8-byte Folded Reload
+; BE-NEXT: and 5, 5, 7
+; BE-NEXT: and 3, 3, 8
+; BE-NEXT: ld 31, 1032(1) # 8-byte Folded Reload
+; BE-NEXT: and 6, 6, 7
+; BE-NEXT: and 4, 4, 8
+; BE-NEXT: ld 8, 376(1) # 8-byte Folded Reload
+; BE-NEXT: or 3, 3, 5
+; BE-NEXT: or 4, 4, 6
+; BE-NEXT: sldi 5, 3, 4
+; BE-NEXT: rldicl 3, 3, 60, 4
+; BE-NEXT: ld 7, 384(1) # 8-byte Folded Reload
+; BE-NEXT: and 5, 5, 8
+; BE-NEXT: sldi 6, 4, 4
+; BE-NEXT: and 3, 3, 7
+; BE-NEXT: rldicl 4, 4, 60, 4
+; BE-NEXT: ld 30, 1024(1) # 8-byte Folded Reload
+; BE-NEXT: or 3, 3, 5
+; BE-NEXT: and 6, 6, 8
+; BE-NEXT: ld 29, 1016(1) # 8-byte Folded Reload
+; BE-NEXT: and 4, 4, 7
+; BE-NEXT: rotlwi 5, 3, 24
+; BE-NEXT: or 4, 4, 6
+; BE-NEXT: rlwimi 5, 3, 8, 8, 15
+; BE-NEXT: ld 28, 1008(1) # 8-byte Folded Reload
+; BE-NEXT: rotlwi 6, 4, 24
+; BE-NEXT: ld 27, 1000(1) # 8-byte Folded Reload
+; BE-NEXT: rldicl 7, 3, 32, 32
+; BE-NEXT: rlwimi 5, 3, 8, 24, 31
+; BE-NEXT: rldicl 3, 4, 32, 32
+; BE-NEXT: ld 26, 992(1) # 8-byte Folded Reload
+; BE-NEXT: rlwimi 6, 4, 8, 8, 15
+; BE-NEXT: rotlwi 8, 7, 24
+; BE-NEXT: ld 25, 984(1) # 8-byte Folded Reload
+; BE-NEXT: rotlwi 9, 3, 24
+; BE-NEXT: rlwimi 8, 7, 8, 8, 15
+; BE-NEXT: rlwimi 9, 3, 8, 8, 15
+; BE-NEXT: ld 24, 976(1) # 8-byte Folded Reload
+; BE-NEXT: ld 23, 968(1) # 8-byte Folded Reload
+; BE-NEXT: rlwimi 6, 4, 8, 24, 31
+; BE-NEXT: rlwimi 8, 7, 8, 24, 31
+; BE-NEXT: ld 22, 960(1) # 8-byte Folded Reload
+; BE-NEXT: rlwimi 9, 3, 8, 24, 31
+; BE-NEXT: sldi 3, 5, 32
+; BE-NEXT: ld 21, 952(1) # 8-byte Folded Reload
+; BE-NEXT: sldi 4, 6, 32
+; BE-NEXT: or 3, 3, 8
+; BE-NEXT: or 4, 4, 9
+; BE-NEXT: ld 20, 944(1) # 8-byte Folded Reload
+; BE-NEXT: rldicl 3, 3, 63, 1
+; BE-NEXT: rldicl 4, 4, 63, 1
+; BE-NEXT: ld 19, 936(1) # 8-byte Folded Reload
+; BE-NEXT: ld 18, 928(1) # 8-byte Folded Reload
+; BE-NEXT: ld 17, 920(1) # 8-byte Folded Reload
+; BE-NEXT: ld 16, 912(1) # 8-byte Folded Reload
+; BE-NEXT: ld 15, 904(1) # 8-byte Folded Reload
+; BE-NEXT: ld 14, 896(1) # 8-byte Folded Reload
+; BE-NEXT: addi 1, 1, 1040
+; BE-NEXT: blr
+;
+; LE-LABEL: clmulh_v2i64:
+; LE: # %bb.0:
+; LE-NEXT: stdu 1, -736(1)
+; LE-NEXT: lis 4, -21846
+; LE-NEXT: lis 5, 21845
+; LE-NEXT: xxswapd 1, 35
+; LE-NEXT: xxswapd 0, 34
+; LE-NEXT: mfvsrd 3, 35
+; LE-NEXT: mfvsrd 9, 34
+; LE-NEXT: lis 6, -13108
+; LE-NEXT: lis 7, 13107
+; LE-NEXT: ori 4, 4, 43690
+; LE-NEXT: ori 5, 5, 21845
+; LE-NEXT: mffprd 8, 1
+; LE-NEXT: mffprd 10, 0
+; LE-NEXT: std 28, 704(1) # 8-byte Folded Spill
+; LE-NEXT: std 29, 712(1) # 8-byte Folded Spill
+; LE-NEXT: ori 6, 6, 52428
+; LE-NEXT: ori 7, 7, 13107
+; LE-NEXT: sldi 4, 4, 32
+; LE-NEXT: sldi 5, 5, 32
+; LE-NEXT: sldi 6, 6, 32
+; LE-NEXT: sldi 7, 7, 32
+; LE-NEXT: sldi 11, 3, 1
+; LE-NEXT: rldicl 3, 3, 63, 1
+; LE-NEXT: std 30, 720(1) # 8-byte Folded Spill
+; LE-NEXT: lis 0, -3856
+; LE-NEXT: oris 4, 4, 43690
+; LE-NEXT: oris 5, 5, 21845
+; LE-NEXT: lis 30, 3855
+; LE-NEXT: oris 6, 6, 52428
+; LE-NEXT: sldi 12, 10, 1
+; LE-NEXT: rldicl 10, 10, 63, 1
+; LE-NEXT: oris 7, 7, 13107
+; LE-NEXT: std 27, 696(1) # 8-byte Folded Spill
+; LE-NEXT: ori 28, 4, 43690
+; LE-NEXT: ori 29, 5, 21845
+; LE-NEXT: std 14, 592(1) # 8-byte Folded Spill
+; LE-NEXT: std 15, 600(1) # 8-byte Folded Spill
+; LE-NEXT: sldi 4, 8, 1
+; LE-NEXT: rldicl 5, 8, 63, 1
+; LE-NEXT: std 16, 608(1) # 8-byte Folded Spill
+; LE-NEXT: std 17, 616(1) # 8-byte Folded Spill
+; LE-NEXT: sldi 8, 9, 1
+; LE-NEXT: rldicl 9, 9, 63, 1
+; LE-NEXT: std 28, 568(1) # 8-byte Folded Spill
+; LE-NEXT: std 29, 576(1) # 8-byte Folded Spill
+; LE-NEXT: and 11, 11, 28
+; LE-NEXT: and 3, 3, 29
+; LE-NEXT: std 18, 624(1) # 8-byte Folded Spill
+; LE-NEXT: std 19, 632(1) # 8-byte Folded Spill
+; LE-NEXT: and 4, 4, 28
+; LE-NEXT: and 5, 5, 29
+; LE-NEXT: std 20, 640(1) # 8-byte Folded Spill
+; LE-NEXT: std 21, 648(1) # 8-byte Folded Spill
+; LE-NEXT: and 8, 8, 28
+; LE-NEXT: and 9, 9, 29
+; LE-NEXT: std 22, 656(1) # 8-byte Folded Spill
+; LE-NEXT: std 23, 664(1) # 8-byte Folded Spill
+; LE-NEXT: and 12, 12, 28
+; LE-NEXT: and 10, 10, 29
+; LE-NEXT: std 24, 672(1) # 8-byte Folded Spill
+; LE-NEXT: std 25, 680(1) # 8-byte Folded Spill
+; LE-NEXT: or 3, 3, 11
+; LE-NEXT: or 4, 5, 4
+; LE-NEXT: std 26, 688(1) # 8-byte Folded Spill
+; LE-NEXT: std 31, 728(1) # 8-byte Folded Spill
+; LE-NEXT: ori 5, 0, 61680
+; LE-NEXT: ori 11, 30, 3855
+; LE-NEXT: std 2, 584(1) # 8-byte Folded Spill
+; LE-NEXT: ori 30, 6, 52428
+; LE-NEXT: ori 0, 7, 13107
+; LE-NEXT: std 30, 552(1) # 8-byte Folded Spill
+; LE-NEXT: std 0, 560(1) # 8-byte Folded Spill
+; LE-NEXT: or 6, 9, 8
+; LE-NEXT: or 7, 10, 12
+; LE-NEXT: sldi 8, 3, 2
+; LE-NEXT: rldicl 3, 3, 62, 2
+; LE-NEXT: sldi 9, 4, 2
+; LE-NEXT: rldicl 4, 4, 62, 2
+; LE-NEXT: sldi 5, 5, 32
+; LE-NEXT: sldi 10, 11, 32
+; LE-NEXT: sldi 11, 6, 2
+; LE-NEXT: rldicl 6, 6, 62, 2
+; LE-NEXT: sldi 12, 7, 2
+; LE-NEXT: rldicl 7, 7, 62, 2
+; LE-NEXT: and 8, 8, 30
+; LE-NEXT: and 3, 3, 0
+; LE-NEXT: and 9, 9, 30
+; LE-NEXT: and 4, 4, 0
+; LE-NEXT: oris 5, 5, 61680
+; LE-NEXT: oris 10, 10, 3855
+; LE-NEXT: and 11, 11, 30
+; LE-NEXT: and 6, 6, 0
+; LE-NEXT: and 12, 12, 30
+; LE-NEXT: and 7, 7, 0
+; LE-NEXT: or 3, 3, 8
+; LE-NEXT: or 4, 4, 9
+; LE-NEXT: ori 30, 5, 61680
+; LE-NEXT: std 30, 536(1) # 8-byte Folded Spill
+; LE-NEXT: ori 0, 10, 3855
+; LE-NEXT: std 0, 544(1) # 8-byte Folded Spill
+; LE-NEXT: or 5, 6, 11
+; LE-NEXT: or 6, 7, 12
+; LE-NEXT: sldi 7, 3, 4
+; LE-NEXT: rldicl 3, 3, 60, 4
+; LE-NEXT: sldi 8, 4, 4
+; LE-NEXT: rldicl 4, 4, 60, 4
+; LE-NEXT: sldi 9, 5, 4
+; LE-NEXT: rldicl 5, 5, 60, 4
+; LE-NEXT: sldi 10, 6, 4
+; LE-NEXT: rldicl 6, 6, 60, 4
+; LE-NEXT: and 7, 7, 30
+; LE-NEXT: and 3, 3, 0
+; LE-NEXT: and 8, 8, 30
+; LE-NEXT: and 4, 4, 0
+; LE-NEXT: and 9, 9, 30
+; LE-NEXT: and 5, 5, 0
+; LE-NEXT: and 10, 10, 30
+; LE-NEXT: and 6, 6, 0
+; LE-NEXT: or 3, 3, 7
+; LE-NEXT: or 4, 4, 8
+; LE-NEXT: or 5, 5, 9
+; LE-NEXT: or 6, 6, 10
+; LE-NEXT: rldicl 7, 3, 32, 32
+; LE-NEXT: rotlwi 8, 3, 24
+; LE-NEXT: rldicl 9, 4, 32, 32
+; LE-NEXT: rotlwi 10, 4, 24
+; LE-NEXT: rldicl 11, 5, 32, 32
+; LE-NEXT: rotlwi 12, 5, 24
+; LE-NEXT: rotlwi 29, 7, 24
+; LE-NEXT: rlwimi 8, 3, 8, 8, 15
+; LE-NEXT: rotlwi 28, 9, 24
+; LE-NEXT: rlwimi 10, 4, 8, 8, 15
+; LE-NEXT: rlwimi 8, 3, 8, 24, 31
+; LE-NEXT: rlwimi 10, 4, 8, 24, 31
+; LE-NEXT: rotlwi 4, 11, 24
+; LE-NEXT: rlwimi 12, 5, 8, 8, 15
+; LE-NEXT: rlwimi 29, 7, 8, 8, 15
+; LE-NEXT: sldi 3, 8, 32
+; LE-NEXT: rlwimi 28, 9, 8, 8, 15
+; LE-NEXT: sldi 8, 10, 32
+; LE-NEXT: rlwimi 12, 5, 8, 24, 31
+; LE-NEXT: rlwimi 29, 7, 8, 24, 31
+; LE-NEXT: rlwimi 28, 9, 8, 24, 31
+; LE-NEXT: rlwimi 4, 11, 8, 8, 15
+; LE-NEXT: sldi 5, 12, 32
+; LE-NEXT: or 9, 3, 29
+; LE-NEXT: or 3, 8, 28
+; LE-NEXT: rlwimi 4, 11, 8, 24, 31
+; LE-NEXT: or 10, 5, 4
+; LE-NEXT: rlwinm 4, 3, 0, 30, 30
+; LE-NEXT: std 4, 528(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 5, 5
+; LE-NEXT: std 4, 376(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 4, 4
+; LE-NEXT: std 4, 368(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 3, 3
+; LE-NEXT: std 4, 360(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 2, 2
+; LE-NEXT: std 4, 352(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 1, 1
+; LE-NEXT: std 4, 344(1) # 8-byte Folded Spill
+; LE-NEXT: rlwinm 4, 3, 0, 0, 0
+; LE-NEXT: std 4, 336(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 32, 32
+; LE-NEXT: std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 31, 33
+; LE-NEXT: std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 30, 34
+; LE-NEXT: std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 29, 35
+; LE-NEXT: std 4, 248(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 28, 36
+; LE-NEXT: std 4, 240(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 27, 37
+; LE-NEXT: std 4, 232(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 26, 38
+; LE-NEXT: std 4, 224(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 25, 39
+; LE-NEXT: std 4, 216(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 24, 40
+; LE-NEXT: rldicl 0, 6, 32, 32
+; LE-NEXT: rotlwi 30, 6, 24
+; LE-NEXT: rotlwi 27, 0, 24
+; LE-NEXT: std 4, 208(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 23, 41
+; LE-NEXT: rlwimi 30, 6, 8, 8, 15
+; LE-NEXT: rlwimi 30, 6, 8, 24, 31
+; LE-NEXT: rlwimi 27, 0, 8, 8, 15
+; LE-NEXT: std 4, 200(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 22, 42
+; LE-NEXT: sldi 6, 30, 32
+; LE-NEXT: rlwimi 27, 0, 8, 24, 31
+; LE-NEXT: or 11, 6, 27
+; LE-NEXT: std 4, 192(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 21, 43
+; LE-NEXT: clrldi 5, 3, 63
+; LE-NEXT: rlwinm 6, 3, 0, 29, 29
+; LE-NEXT: rlwinm 7, 3, 0, 28, 28
+; LE-NEXT: std 4, 184(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 20, 44
+; LE-NEXT: rlwinm 8, 3, 0, 27, 27
+; LE-NEXT: rlwinm 12, 3, 0, 26, 26
+; LE-NEXT: rlwinm 0, 3, 0, 25, 25
+; LE-NEXT: std 4, 176(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 19, 45
+; LE-NEXT: rlwinm 30, 3, 0, 24, 24
+; LE-NEXT: rlwinm 29, 3, 0, 23, 23
+; LE-NEXT: rlwinm 28, 3, 0, 22, 22
+; LE-NEXT: std 4, 168(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 18, 46
+; LE-NEXT: rlwinm 27, 3, 0, 21, 21
+; LE-NEXT: rlwinm 26, 3, 0, 20, 20
+; LE-NEXT: rlwinm 25, 3, 0, 19, 19
+; LE-NEXT: std 4, 160(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 17, 47
+; LE-NEXT: rlwinm 24, 3, 0, 18, 18
+; LE-NEXT: rlwinm 23, 3, 0, 17, 17
+; LE-NEXT: rlwinm 22, 3, 0, 16, 16
+; LE-NEXT: std 4, 152(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 16, 48
+; LE-NEXT: rlwinm 21, 3, 0, 15, 15
+; LE-NEXT: rlwinm 20, 3, 0, 14, 14
+; LE-NEXT: rlwinm 19, 3, 0, 13, 13
+; LE-NEXT: std 4, 144(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 15, 49
+; LE-NEXT: rlwinm 18, 3, 0, 12, 12
+; LE-NEXT: rlwinm 17, 3, 0, 11, 11
+; LE-NEXT: rlwinm 16, 3, 0, 10, 10
+; LE-NEXT: std 4, 136(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 14, 50
+; LE-NEXT: rlwinm 15, 3, 0, 9, 9
+; LE-NEXT: rlwinm 14, 3, 0, 8, 8
+; LE-NEXT: rlwinm 31, 3, 0, 7, 7
+; LE-NEXT: std 4, 128(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 13, 51
+; LE-NEXT: rlwinm 2, 3, 0, 6, 6
+; LE-NEXT: std 4, 120(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 12, 52
+; LE-NEXT: std 4, 112(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 11, 53
+; LE-NEXT: std 4, 104(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 10, 54
+; LE-NEXT: std 4, 96(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 9, 55
+; LE-NEXT: std 4, 88(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 8, 56
+; LE-NEXT: std 4, 80(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 7, 57
+; LE-NEXT: std 4, 72(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 6, 58
+; LE-NEXT: std 4, 64(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 5, 59
+; LE-NEXT: std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 4, 60
+; LE-NEXT: std 4, 48(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 3, 61
+; LE-NEXT: rldicl 3, 3, 2, 62
+; LE-NEXT: std 3, 32(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 528(1) # 8-byte Folded Reload
+; LE-NEXT: std 4, 40(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 288(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 5
+; LE-NEXT: std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 6
+; LE-NEXT: std 3, 296(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 7
+; LE-NEXT: std 3, 304(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 8
+; LE-NEXT: std 3, 312(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 12
+; LE-NEXT: std 3, 320(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 0
+; LE-NEXT: std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 30
+; LE-NEXT: std 3, 528(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 29
+; LE-NEXT: std 3, 520(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 28
+; LE-NEXT: std 3, 512(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 27
+; LE-NEXT: std 3, 504(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 26
+; LE-NEXT: std 3, 496(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 25
+; LE-NEXT: std 3, 488(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 24
+; LE-NEXT: std 3, 480(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 23
+; LE-NEXT: std 3, 472(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 22
+; LE-NEXT: std 3, 464(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 21
+; LE-NEXT: std 3, 456(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 20
+; LE-NEXT: std 3, 448(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 19
+; LE-NEXT: std 3, 440(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 18
+; LE-NEXT: std 3, 432(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 17
+; LE-NEXT: std 3, 424(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 16
+; LE-NEXT: std 3, 416(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 15
+; LE-NEXT: std 3, 408(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 14
+; LE-NEXT: std 3, 400(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 31
+; LE-NEXT: std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 2
+; LE-NEXT: std 3, 384(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 376(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 376(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 368(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 368(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 360(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 360(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 352(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 352(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 344(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 344(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 336(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 336(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 272(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 4, 3, 32, 31
+; LE-NEXT: ld 3, 264(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 4, 11, 4
+; LE-NEXT: rldicl 5, 3, 33, 30
+; LE-NEXT: ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 6, 3, 34, 29
+; LE-NEXT: ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT: std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 4, 11, 5
+; LE-NEXT: ld 5, 280(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 7, 3, 35, 28
+; LE-NEXT: ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 8, 3, 36, 27
+; LE-NEXT: ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT: std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 4, 11, 6
+; LE-NEXT: mulld 6, 11, 7
+; LE-NEXT: mulld 7, 11, 8
+; LE-NEXT: rldicl 12, 3, 37, 26
+; LE-NEXT: ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 8, 11, 12
+; LE-NEXT: std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT: clrldi 4, 9, 63
+; LE-NEXT: rldicl 0, 3, 38, 25
+; LE-NEXT: ld 3, 216(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: mulld 12, 11, 0
+; LE-NEXT: rldicl 30, 3, 39, 24
+; LE-NEXT: ld 3, 208(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 0, 11, 30
+; LE-NEXT: rldicl 29, 3, 40, 23
+; LE-NEXT: ld 3, 200(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 30, 11, 29
+; LE-NEXT: rldicl 28, 3, 41, 22
+; LE-NEXT: ld 3, 192(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 29, 11, 28
+; LE-NEXT: rldicl 27, 3, 42, 21
+; LE-NEXT: ld 3, 184(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 28, 11, 27
+; LE-NEXT: rldicl 26, 3, 43, 20
+; LE-NEXT: ld 3, 176(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 27, 11, 26
+; LE-NEXT: rldicl 25, 3, 44, 19
+; LE-NEXT: ld 3, 168(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 26, 11, 25
+; LE-NEXT: rldicl 24, 3, 45, 18
+; LE-NEXT: ld 3, 160(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 25, 11, 24
+; LE-NEXT: rldicl 23, 3, 46, 17
+; LE-NEXT: ld 3, 152(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 24, 11, 23
+; LE-NEXT: rldicl 22, 3, 47, 16
+; LE-NEXT: ld 3, 144(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 23, 11, 22
+; LE-NEXT: rldicl 21, 3, 48, 15
+; LE-NEXT: ld 3, 136(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 22, 11, 21
+; LE-NEXT: rldicl 20, 3, 49, 14
+; LE-NEXT: ld 3, 128(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 21, 11, 20
+; LE-NEXT: rldicl 19, 3, 50, 13
+; LE-NEXT: ld 3, 120(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 20, 11, 19
+; LE-NEXT: rldicl 18, 3, 51, 12
+; LE-NEXT: ld 3, 112(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 19, 11, 18
+; LE-NEXT: rldicl 17, 3, 52, 11
+; LE-NEXT: ld 3, 104(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 18, 11, 17
+; LE-NEXT: rldicl 16, 3, 53, 10
+; LE-NEXT: ld 3, 96(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 17, 11, 16
+; LE-NEXT: rldicl 15, 3, 54, 9
+; LE-NEXT: ld 3, 88(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 16, 11, 15
+; LE-NEXT: rldicl 14, 3, 55, 8
+; LE-NEXT: ld 3, 80(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 15, 11, 14
+; LE-NEXT: rldicl 31, 3, 56, 7
+; LE-NEXT: ld 3, 72(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 14, 11, 31
+; LE-NEXT: rldicl 2, 3, 57, 6
+; LE-NEXT: ld 3, 64(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 31, 11, 2
+; LE-NEXT: rldicl 3, 3, 58, 5
+; LE-NEXT: std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 56(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 59, 4
+; LE-NEXT: std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 48(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 60, 3
+; LE-NEXT: std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 61, 2
+; LE-NEXT: std 3, 224(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 32(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 62, 1
+; LE-NEXT: std 3, 216(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 2, 11, 3
+; LE-NEXT: ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 216(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 11, 11, 3
+; LE-NEXT: rlwinm 3, 9, 0, 30, 30
+; LE-NEXT: mulld 3, 10, 3
+; LE-NEXT: xor 3, 4, 3
+; LE-NEXT: ld 4, 288(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 5, 4
+; LE-NEXT: rlwinm 5, 9, 0, 29, 29
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 296(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 28, 28
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 304(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 27, 27
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 312(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 26, 26
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 320(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 25, 25
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: ld 5, 328(1) # 8-byte Folded Reload
+; LE-NEXT: xor 4, 4, 5
+; LE-NEXT: rlwinm 5, 9, 0, 24, 24
+; LE-NEXT: mulld 5, 10, 5
+; LE-NEXT: xor 3, 3, 5
+; LE-NEXT: std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 528(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 4, 3
+; LE-NEXT: ld 4, 520(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 512(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 504(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 496(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 488(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 480(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 472(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 464(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 456(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 448(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 440(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 432(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 424(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 416(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 408(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 400(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 392(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 384(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 376(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 368(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 360(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 352(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 344(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 336(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 272(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 264(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 256(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 248(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 6
+; LE-NEXT: ld 6, 576(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 7
+; LE-NEXT: ld 7, 568(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 8
+; LE-NEXT: ld 8, 560(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 12
+; LE-NEXT: ld 12, 544(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 0
+; LE-NEXT: ld 0, 536(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 30
+; LE-NEXT: ld 30, 720(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 29
+; LE-NEXT: ld 29, 712(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 28
+; LE-NEXT: ld 28, 704(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 27
+; LE-NEXT: ld 27, 696(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 26
+; LE-NEXT: ld 26, 688(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 25
+; LE-NEXT: ld 25, 680(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 24
+; LE-NEXT: ld 24, 672(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 23
+; LE-NEXT: ld 23, 664(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 22
+; LE-NEXT: ld 22, 656(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 21
+; LE-NEXT: ld 21, 648(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 20
+; LE-NEXT: ld 20, 640(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 19
+; LE-NEXT: ld 19, 632(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 18
+; LE-NEXT: ld 18, 624(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 17
+; LE-NEXT: ld 17, 616(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 16
+; LE-NEXT: ld 16, 608(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 15
+; LE-NEXT: ld 15, 600(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 14
+; LE-NEXT: ld 14, 592(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 31
+; LE-NEXT: ld 31, 728(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 2
+; LE-NEXT: ld 2, 584(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 240(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 232(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: xor 3, 3, 11
+; LE-NEXT: ld 11, 552(1) # 8-byte Folded Reload
+; LE-NEXT: sldi 4, 3, 1
+; LE-NEXT: rldicl 3, 3, 63, 1
+; LE-NEXT: and 4, 4, 7
+; LE-NEXT: and 3, 3, 6
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: sldi 4, 3, 2
+; LE-NEXT: rldicl 3, 3, 62, 2
+; LE-NEXT: and 4, 4, 11
+; LE-NEXT: and 3, 3, 8
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: sldi 4, 3, 4
+; LE-NEXT: rldicl 3, 3, 60, 4
+; LE-NEXT: and 4, 4, 0
+; LE-NEXT: and 3, 3, 12
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: rotlwi 5, 3, 24
+; LE-NEXT: rldicl 4, 3, 32, 32
+; LE-NEXT: rlwimi 5, 3, 8, 8, 15
+; LE-NEXT: rlwimi 5, 3, 8, 24, 31
+; LE-NEXT: rotlwi 3, 4, 24
+; LE-NEXT: rlwimi 3, 4, 8, 8, 15
+; LE-NEXT: rlwimi 3, 4, 8, 24, 31
+; LE-NEXT: sldi 4, 5, 32
+; LE-NEXT: or 3, 4, 3
+; LE-NEXT: ld 4, 328(1) # 8-byte Folded Reload
+; LE-NEXT: rldicl 3, 3, 63, 1
+; LE-NEXT: mtfprd 0, 3
+; LE-NEXT: rlwinm 3, 9, 0, 23, 23
+; LE-NEXT: mulld 3, 10, 3
+; LE-NEXT: xor 3, 4, 3
+; LE-NEXT: rlwinm 4, 9, 0, 22, 22
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 21, 21
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 20, 20
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 19, 19
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 18, 18
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 17, 17
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 16, 16
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 15, 15
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 14, 14
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 13, 13
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 12, 12
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 11, 11
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 10, 10
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 9, 9
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 8, 8
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 7, 7
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 6, 6
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 5, 5
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 4, 4
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 3, 3
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 2, 2
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 1, 1
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rlwinm 4, 9, 0, 0, 0
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 32, 32
+; LE-NEXT: rldicl 4, 4, 32, 31
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 31, 33
+; LE-NEXT: rldicl 4, 4, 33, 30
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 30, 34
+; LE-NEXT: rldicl 4, 4, 34, 29
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 29, 35
+; LE-NEXT: rldicl 4, 4, 35, 28
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 28, 36
+; LE-NEXT: rldicl 4, 4, 36, 27
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 27, 37
+; LE-NEXT: rldicl 4, 4, 37, 26
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 26, 38
+; LE-NEXT: rldicl 4, 4, 38, 25
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 25, 39
+; LE-NEXT: rldicl 4, 4, 39, 24
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 24, 40
+; LE-NEXT: rldicl 4, 4, 40, 23
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 23, 41
+; LE-NEXT: rldicl 4, 4, 41, 22
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 22, 42
+; LE-NEXT: rldicl 4, 4, 42, 21
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 21, 43
+; LE-NEXT: rldicl 4, 4, 43, 20
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 20, 44
+; LE-NEXT: rldicl 4, 4, 44, 19
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 19, 45
+; LE-NEXT: rldicl 4, 4, 45, 18
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 18, 46
+; LE-NEXT: rldicl 4, 4, 46, 17
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 17, 47
+; LE-NEXT: rldicl 4, 4, 47, 16
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 16, 48
+; LE-NEXT: rldicl 4, 4, 48, 15
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 15, 49
+; LE-NEXT: rldicl 4, 4, 49, 14
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 14, 50
+; LE-NEXT: rldicl 4, 4, 50, 13
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 13, 51
+; LE-NEXT: rldicl 4, 4, 51, 12
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 12, 52
+; LE-NEXT: rldicl 4, 4, 52, 11
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 11, 53
+; LE-NEXT: rldicl 4, 4, 53, 10
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 10, 54
+; LE-NEXT: rldicl 4, 4, 54, 9
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 9, 55
+; LE-NEXT: rldicl 4, 4, 55, 8
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 8, 56
+; LE-NEXT: rldicl 4, 4, 56, 7
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 7, 57
+; LE-NEXT: rldicl 4, 4, 57, 6
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 6, 58
+; LE-NEXT: rldicl 4, 4, 58, 5
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 5, 59
+; LE-NEXT: rldicl 4, 4, 59, 4
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 4, 60
+; LE-NEXT: rldicl 4, 4, 60, 3
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 3, 61
+; LE-NEXT: rldicl 4, 4, 61, 2
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicl 4, 9, 2, 62
+; LE-NEXT: rldicl 4, 4, 62, 1
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: sldi 4, 3, 1
+; LE-NEXT: rldicl 3, 3, 63, 1
+; LE-NEXT: and 4, 4, 7
+; LE-NEXT: and 3, 3, 6
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: sldi 4, 3, 2
+; LE-NEXT: rldicl 3, 3, 62, 2
+; LE-NEXT: and 4, 4, 11
+; LE-NEXT: and 3, 3, 8
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: sldi 4, 3, 4
+; LE-NEXT: rldicl 3, 3, 60, 4
+; LE-NEXT: and 4, 4, 0
+; LE-NEXT: and 3, 3, 12
+; LE-NEXT: or 3, 3, 4
+; LE-NEXT: rldicl 4, 3, 32, 32
+; LE-NEXT: rotlwi 5, 4, 24
+; LE-NEXT: rlwimi 5, 4, 8, 8, 15
+; LE-NEXT: rlwimi 5, 4, 8, 24, 31
+; LE-NEXT: rotlwi 4, 3, 24
+; LE-NEXT: rlwimi 4, 3, 8, 8, 15
+; LE-NEXT: rlwimi 4, 3, 8, 24, 31
+; LE-NEXT: sldi 3, 4, 32
+; LE-NEXT: or 3, 3, 5
+; LE-NEXT: rldicl 3, 3, 63, 1
+; LE-NEXT: mtfprd 1, 3
+; LE-NEXT: xxmrghd 34, 1, 0
+; LE-NEXT: addi 1, 1, 736
+; LE-NEXT: blr
+ %a.ext = zext <2 x i64> %a to <2 x i128>
+ %b.ext = zext <2 x i64> %b to <2 x i128>
+ %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+ %res.ext = lshr <2 x i128> %clmul, splat (i128 64)
+ %res = trunc <2 x i128> %res.ext to <2 x i64>
+ ret <2 x i64> %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index 92b176b7a4bbb..b4dd00125aab5 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -797,17 +797,13 @@ define i1 @pow2_and_fail0(i32 %x, i32 %y) {
ret i1 %r
}
-define i1 @pow2_and_fail1(i32 %x, i32 %y) {
-; CHECK-LABEL: pow2_and_fail1:
+define i1 @pow2_andnot_3op(i32 %x, i32 %y) {
+; CHECK-LABEL: pow2_andnot_3op:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shll %cl, %eax
; CHECK-NEXT: notl %edi
-; CHECK-NEXT: andl %eax, %edi
-; CHECK-NEXT: testl $-2, %edi
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: andl $-2, %edi
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: setae %al
; CHECK-NEXT: retq
%yy = shl i32 1, %y
%nyy = sub i32 1, %yy
@@ -817,17 +813,13 @@ define i1 @pow2_and_fail1(i32 %x, i32 %y) {
ret i1 %r
}
-define i1 @pow2_and_fail2(i32 %x, i32 %y, i32 %z) {
-; CHECK-LABEL: pow2_and_fail2:
+define i1 @pow2_and_3op(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: pow2_and_3op:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shll %cl, %eax
-; CHECK-NEXT: andl %edx, %eax
; CHECK-NEXT: notl %edi
-; CHECK-NEXT: testl %edi, %eax
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: andl %edx, %edi
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: setae %al
; CHECK-NEXT: retq
%yy = shl i32 1, %y
%d = and i32 %yy, %z
@@ -856,13 +848,9 @@ define i1 @pow2_though_zext(i32 %x, i16 %y) {
define i1 @pow2_and_i20(i20 %num, i20 %shift) {
; CHECK-LABEL: pow2_and_i20:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shll %cl, %eax
-; CHECK-NEXT: andl %edi, %eax
-; CHECK-NEXT: testl $1048575, %eax # imm = 0xFFFFF
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: andl $1048575, %edi # imm = 0xFFFFF
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: setae %al
; CHECK-NEXT: retq
%mask = shl nuw i20 1, %shift
%bit = and i20 %mask, %num
@@ -873,13 +861,10 @@ define i1 @pow2_and_i20(i20 %num, i20 %shift) {
define i1 @pow2_and_i50(i50 %num, i50 %shift) {
; CHECK-LABEL: pow2_and_i50:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: shlq %cl, %rax
+; CHECK-NEXT: movabsq $1125899906842623, %rax # imm = 0x3FFFFFFFFFFFF
; CHECK-NEXT: andq %rdi, %rax
-; CHECK-NEXT: shlq $14, %rax
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: btq %rsi, %rax
+; CHECK-NEXT: setae %al
; CHECK-NEXT: retq
%mask = shl nuw i50 1, %shift
%bit = and i50 %mask, %num
diff --git a/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s
new file mode 100644
index 0000000000000..abdb344ac0614
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w32.s
@@ -0,0 +1,1529 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 %s 2>&1 | FileCheck --check-prefix=GFX1170-ERR --implicit-check-not=error: %s
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0, v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1
+// GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1
+// GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s
new file mode 100644
index 0000000000000..6b1b889f8bedd
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1170_asm_wmma_w64.s
@@ -0,0 +1,1529 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX1170-ERR --implicit-check-not=error: %s
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], s[0:1], v[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], 1.0, v[2:3], v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1
+// GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1
+// GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0]
+// GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1
+// GFX1170-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx1170_unsupported.s b/llvm/test/MC/AMDGPU/gfx1170_unsupported.s
new file mode 100644
index 0000000000000..b4e0da1779ffb
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1170_unsupported.s
@@ -0,0 +1,11 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1170 %s 2>&1 | FileCheck --implicit-check-not=error: %s
+
+//===----------------------------------------------------------------------===//
+// Unsupported instructions.
+//===----------------------------------------------------------------------===//
+
+v_dot2c_f32_f16 v0, v1, v2
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_dot2acc_f32_f16 v5, v1, v2
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s
index 363db1a16b170..a96e9c4c07873 100644
--- a/llvm/test/MC/AMDGPU/literals.s
+++ b/llvm/test/MC/AMDGPU/literals.s
@@ -206,14 +206,14 @@ v_fract_f64_e32 v[0:1], lit(1.0)
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0
// GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b]
// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0)
// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
@@ -658,14 +658,14 @@ v_fract_f64_e32 v[0:1], 0xffffffffffffffff
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1
// GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1)
// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt
new file mode 100644
index 0000000000000..1e778fb04aea2
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w32.txt
@@ -0,0 +1,1628 @@
+# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s
+# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX1170-ERR %s
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18]
+
+[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18]
+
+[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18]
+
+[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18]
+
+[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18]
+
+
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18]
+
+[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18]
+
+
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt
new file mode 100644
index 0000000000000..169fd20488e37
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1170_dasm_wmma_w64.txt
@@ -0,0 +1,1628 @@
+# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX1170 %s
+# RUN: not llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1170 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX1170-ERR %s
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX1170-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX1170: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18]
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-type-constant-folding.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-type-constant-folding.ll
new file mode 100644
index 0000000000000..5b12fd3ec545f
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-type-constant-folding.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+;
+; Test type mismatch in ConstantFolding for vector types.
+
+define internal void @f() {
+; CHECK-LABEL: define internal void @f() {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT: store <4 x i16> <i16 sub (i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 0), i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 0)), i16 sub (i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 1), i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 1)), i16 sub (i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 2), i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 2)), i16 sub (i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 3), i16 extractelement (<4 x i16> bitcast (i64 ptrtoint (ptr @f to i64) to <4 x i16>), i32 3))>, ptr @f, align 8
+; CHECK-NEXT: ret void
+;
+ %1 = ptrtoint ptr @f to i64
+ %2 = bitcast i64 %1 to <4 x i16>
+ %3 = ptrtoint ptr @f to i64
+ %4 = bitcast i64 %3 to <4 x i16>
+ %sub = sub <4 x i16> %2, %4
+ store <4 x i16> %sub, ptr @f, align 8
+ ret void
+}
diff --git a/llvm/unittests/Target/X86/CMakeLists.txt b/llvm/unittests/Target/X86/CMakeLists.txt
index b011681aa3b95..253ac5db96df2 100644
--- a/llvm/unittests/Target/X86/CMakeLists.txt
+++ b/llvm/unittests/Target/X86/CMakeLists.txt
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
MC
MIRParser
Passes
+ SelectionDAG
Support
Target
TargetParser
@@ -24,4 +25,5 @@ set(LLVM_LINK_COMPONENTS
add_llvm_unittest(X86Tests
MachineSizeOptsTest.cpp
TernlogTest.cpp
+ X86SelectionDAGTest.cpp
)
diff --git a/llvm/unittests/Target/X86/X86SelectionDAGTest.cpp b/llvm/unittests/Target/X86/X86SelectionDAGTest.cpp
new file mode 100644
index 0000000000000..b546908a48931
--- /dev/null
+++ b/llvm/unittests/Target/X86/X86SelectionDAGTest.cpp
@@ -0,0 +1,103 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+
+class X86SelectionDAGTest : public testing::Test {
+protected:
+ const TargetSubtargetInfo *STI;
+
+ static void SetUpTestCase() {
+ LLVMInitializeX86TargetInfo();
+ LLVMInitializeX86Target();
+ LLVMInitializeX86TargetMC();
+ }
+
+ void SetUp() override {
+ StringRef Assembly = "define void @f() { ret void }";
+
+ Triple TargetTriple("x86_64--");
+ std::string Error;
+ const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
+
+ TargetOptions Options;
+ TM = std::unique_ptr<TargetMachine>(T->createTargetMachine(
+ TargetTriple, "x86-64-v4", "", Options, std::nullopt, std::nullopt,
+ CodeGenOptLevel::Aggressive));
+
+ SMDiagnostic SMError;
+ M = parseAssemblyString(Assembly, SMError, Context);
+ if (!M)
+ report_fatal_error(SMError.getMessage());
+ M->setDataLayout(TM->createDataLayout());
+
+ F = M->getFunction("f");
+ if (!F)
+ report_fatal_error("F?");
+
+ MachineModuleInfo MMI(TM.get());
+
+ STI = TM->getSubtargetImpl(*F);
+ MF = std::make_unique<MachineFunction>(*F, *TM, *STI, MMI.getContext(), 0);
+
+ DAG = std::make_unique<SelectionDAG>(*TM, CodeGenOptLevel::None);
+ if (!DAG)
+ report_fatal_error("DAG?");
+ OptimizationRemarkEmitter ORE(F);
+ DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+ MMI, nullptr);
+ }
+
+ LLVMContext Context;
+ std::unique_ptr<TargetMachine> TM;
+ std::unique_ptr<Module> M;
+ Function *F;
+ std::unique_ptr<MachineFunction> MF;
+ std::unique_ptr<SelectionDAG> DAG;
+};
+
+TEST_F(X86SelectionDAGTest, computeKnownBits_FANDN) {
+ SDLoc Loc;
+
+ auto SrcF32 = DAG->getCopyFromReg(DAG->getEntryNode(), Loc, 1, MVT::f32);
+ auto SignBitF32 = DAG->getConstantFP(-0.0f, Loc, MVT::f32);
+ auto OpF32 = DAG->getNode(X86ISD::FANDN, Loc, MVT::f32, SignBitF32, SrcF32);
+ KnownBits KnownF32 = DAG->computeKnownBits(OpF32);
+ EXPECT_TRUE(KnownF32.isNonNegative());
+
+ auto Src2xF64 = DAG->getCopyFromReg(DAG->getEntryNode(), Loc, 1, MVT::v2f64);
+ auto ZeroF64 = DAG->getConstantFP(+0.0f, Loc, MVT::f64);
+ auto SignBitF64 = DAG->getConstantFP(-0.0f, Loc, MVT::f64);
+ auto HiSign2xF64 =
+ DAG->getBuildVector(MVT::v2f64, Loc, {ZeroF64, SignBitF64});
+ auto Op2xF64 =
+ DAG->getNode(X86ISD::FANDN, Loc, MVT::v2f64, HiSign2xF64, Src2xF64);
+ KnownBits KnownAll2xF64 = DAG->computeKnownBits(Op2xF64);
+ KnownBits KnownLo2xF64 = DAG->computeKnownBits(Op2xF64, APInt(2, 1));
+ KnownBits KnownHi2xF64 = DAG->computeKnownBits(Op2xF64, APInt(2, 2));
+ EXPECT_FALSE(KnownAll2xF64.isNonNegative());
+ EXPECT_FALSE(KnownLo2xF64.isNonNegative());
+ EXPECT_TRUE(KnownHi2xF64.isNonNegative());
+}
+
+} // end namespace llvm
diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index 9724493642d75..049ce41ba45ef 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -381,7 +381,7 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
OutStr += ", ";
OutStr += ColInstr->getName();
} else {
- OutStr += ", (uint32_t)-1U";
+ OutStr += ", INSTRUCTION_LIST_END";
}
}
@@ -455,7 +455,7 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) {
OS << ")\n";
OS << " return Table[mid][" << I + 1 << "];\n";
}
- OS << " return (uint32_t)-1U;";
+ OS << " llvm_unreachable(\"Unrecognized column value!\");\n";
} else {
OS << " return Table[mid][1];\n";
}
@@ -474,7 +474,7 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
const ListInit *ColFields = InstrMapDesc.getColFields();
ArrayRef<const ListInit *> ValueCols = InstrMapDesc.getValueCols();
OS << "// " << InstrMapDesc.getName() << "\nLLVM_READONLY\n";
- OS << "int64_t " << InstrMapDesc.getName() << "(uint32_t Opcode";
+ OS << "int32_t " << InstrMapDesc.getName() << "(uint32_t Opcode";
if (ValueCols.size() > 1) {
for (const Init *CF : ColFields->getElements()) {
std::string ColName = CF->getAsUnquotedString();
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/X86/BUILD.gn
index af2c6d38d9519..e70ccd67b4018 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Target/X86/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Target/X86/BUILD.gn
@@ -21,5 +21,6 @@ unittest("X86Tests") {
sources = [
"MachineSizeOptsTest.cpp",
"TernlogTest.cpp",
+ "X86SelectionDAGTest.cpp",
]
}
diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
index 5d2429bb476e6..9af0f301d763c 100644
--- a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
+++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
@@ -157,6 +157,21 @@ void resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
ValueRange indices,
SmallVectorImpl<Value> &sourceIndices);
+/// Given the 'indices' of a load/store operation where the memref is a result
+/// of a rank-reducing full subview op, returns the indices w.r.t. the source
+/// memref of the memref.subview op. For example
+///
+/// %alias = memref.subview %src[0, 0, 0][1, 2, 2][1, 1, 1]: memref<1x2x2xf32>
+/// to memref<2x2xf32>
+/// %val = memref.load %alias[%i, %j] : memref<2x2xf32>
+///
+/// could be folded into
+///
+/// %val = memref.load %src[0, %i, %j] : memref<1x2x2xf32>
+LogicalResult resolveSourceIndicesRankReducingSubview(
+ Location loc, OpBuilder &b, memref::SubViewOp subViewOp, ValueRange indices,
+ SmallVectorImpl<Value> &sourceIndices);
+
} // namespace memref
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
index c85f3b02c4a44..a758032ef69b4 100644
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -18,6 +18,7 @@
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
#include <optional>
+#include <tuple>
namespace mlir {
class Location;
@@ -248,6 +249,12 @@ FailureOr<scf::ParallelOp> parallelLoopUnrollByFactors(
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
IRMapping *clonedToSrcOpsMap = nullptr);
+/// Get constant loop bounds and steps for each of the induction variables of
+/// the given loop operation, if all the loop's ranges are constant. Each entry
+/// in the returned vector is a tuple (lowerBound, upperBound, step).
+llvm::SmallVector<std::tuple<int64_t, int64_t, int64_t>>
+getConstLoopBounds(mlir::LoopLikeOpInterface loopOp);
+
/// Get constant trip counts for each of the induction variables of the given
/// loop operation. If any of the loop's trip counts is not constant, return an
/// empty vector.
diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
index 610ce1f13c56b..78f0fe1392962 100644
--- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
+++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
@@ -449,8 +449,14 @@ struct RoundOpPattern final : public OpConversionPattern<math::RoundOp> {
return res;
Location loc = roundOp.getLoc();
- Value operand = roundOp.getOperand();
- Type ty = operand.getType();
+ auto ty = getTypeConverter()->convertType(adaptor.getOperand().getType());
+ if (!ty) {
+ return rewriter.notifyMatchFailure(
+ roundOp->getLoc(),
+ llvm::formatv("failed to convert type {0} for SPIR-V",
+ roundOp.getType()));
+ }
+
Type ety = getElementTypeOrSelf(ty);
auto zero = spirv::ConstantOp::getZero(ty, loc, rewriter);
@@ -466,14 +472,15 @@ struct RoundOpPattern final : public OpConversionPattern<math::RoundOp> {
rewriter.getFloatAttr(ety, 0.5));
}
- auto abs = spirv::GLFAbsOp::create(rewriter, loc, operand);
+ auto abs = spirv::GLFAbsOp::create(rewriter, loc, adaptor.getOperand());
auto floor = spirv::GLFloorOp::create(rewriter, loc, abs);
auto sub = spirv::FSubOp::create(rewriter, loc, abs, floor);
auto greater =
spirv::FOrdGreaterThanEqualOp::create(rewriter, loc, sub, half);
auto select = spirv::SelectOp::create(rewriter, loc, greater, one, zero);
auto add = spirv::FAddOp::create(rewriter, loc, floor, select);
- rewriter.replaceOpWithNewOp<math::CopySignOp>(roundOp, add, operand);
+ rewriter.replaceOpWithNewOp<math::CopySignOp>(roundOp, add,
+ adaptor.getOperand());
return success();
}
};
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 4c67720654f83..d960201e2b3d0 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -4151,11 +4151,13 @@ LLVMFuncOp BlockAddressOp::getFunction(SymbolTableCollection &symbolTable) {
}
BlockTagOp BlockAddressOp::getBlockTagOp() {
- auto funcOp = dyn_cast<LLVMFuncOp>(mlir::SymbolTable::lookupNearestSymbolFrom(
- parentLLVMModule(*this), getBlockAddr().getFunction()));
+ Operation *sym = mlir::SymbolTable::lookupNearestSymbolFrom(
+ parentLLVMModule(*this), getBlockAddr().getFunction());
+ if (!sym)
+ return nullptr;
+ auto funcOp = dyn_cast<LLVMFuncOp>(sym);
if (!funcOp)
return nullptr;
-
BlockTagOp blockTagOp = nullptr;
funcOp.walk([&](LLVM::BlockTagOp labelOp) {
if (labelOp.getTag() == getBlockAddr().getTag()) {
diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
index 2d341dce665e5..cf126cd85ddce 100644
--- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -286,5 +286,46 @@ void resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
}
}
+LogicalResult resolveSourceIndicesRankReducingSubview(
+ Location loc, OpBuilder &b, memref::SubViewOp subViewOp, ValueRange indices,
+ SmallVectorImpl<Value> &sourceIndices) {
+ if (!subViewOp.hasZeroOffset() || !subViewOp.hasUnitStride())
+ return failure();
+
+ MemRefType srcType = subViewOp.getSourceType();
+ MemRefType resType = subViewOp.getType();
+ unsigned srcRank = srcType.getRank();
+ unsigned resRank = resType.getRank();
+ if (srcRank <= resRank || indices.size() != resRank)
+ return failure();
+
+ auto droppedDims = subViewOp.getDroppedDims();
+ if (droppedDims.none() || droppedDims.count() != srcRank - resRank)
+ return failure();
+
+ auto mixedSizes = subViewOp.getMixedSizes();
+ if (mixedSizes.size() != srcRank)
+ return failure();
+
+ unsigned resultDim = 0;
+ for (unsigned sourceDim = 0; sourceDim < srcRank; ++sourceDim) {
+ if (droppedDims.test(sourceDim)) {
+ auto sizeCst = getConstantIntValue(mixedSizes[sourceDim]);
+ if (!sizeCst || *sizeCst != 1)
+ return failure();
+ sourceIndices.push_back(
+ getValueOrCreateConstantIndexOp(b, loc, b.getIndexAttr(0)));
+ continue;
+ }
+ if (resultDim >= indices.size())
+ return failure();
+ sourceIndices.push_back(indices[resultDim++]);
+ }
+ if (resultDim != indices.size())
+ return failure();
+
+ return success();
+}
+
} // namespace memref
} // namespace mlir
diff --git a/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
index 2bd41d99a3661..2ba1778532860 100644
--- a/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
@@ -1,5 +1,6 @@
add_mlir_dialect_library(MLIROpenACCDialect
OpenACC.cpp
+ OpenACCCG.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenACC
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 03fe5d177e327..ce024648b160c 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -465,16 +465,6 @@ ValueRange SerialOp::getSuccessorInputs(RegionSuccessor successor) {
return getSingleRegionSuccessorInputs(getOperation(), successor);
}
-void KernelEnvironmentOp::getSuccessorRegions(
- RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
- getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
- regions);
-}
-
-ValueRange KernelEnvironmentOp::getSuccessorInputs(RegionSuccessor successor) {
- return getSingleRegionSuccessorInputs(getOperation(), successor);
-}
-
void DataOp::getSuccessorRegions(RegionBranchPoint point,
SmallVectorImpl<RegionSuccessor> &regions) {
getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
@@ -876,20 +866,6 @@ LogicalResult acc::FirstprivateOp::verify() {
return success();
}
-//===----------------------------------------------------------------------===//
-// FirstprivateMapInitialOp
-//===----------------------------------------------------------------------===//
-LogicalResult acc::FirstprivateMapInitialOp::verify() {
- if (getDataClause() != acc::DataClause::acc_firstprivate)
- return emitError("data clause associated with firstprivate operation must "
- "match its intent");
- if (failed(checkVarAndVarType(*this)))
- return failure();
- if (failed(checkNoModifier(*this)))
- return failure();
- return success();
-}
-
//===----------------------------------------------------------------------===//
// ReductionOp
//===----------------------------------------------------------------------===//
@@ -1289,16 +1265,6 @@ void acc::FirstprivateOp::getEffects(
addResultEffect<MemoryEffects::Write>(effects, getAccVar());
}
-// FirstprivateMapInitialOp: var read, accVar result write.
-void acc::FirstprivateMapInitialOp::getEffects(
- SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
- &effects) {
- effects.emplace_back(MemoryEffects::Read::get(),
- acc::CurrentDeviceIdResource::get());
- addOperandEffect<MemoryEffects::Read>(effects, getVarMutable());
- addResultEffect<MemoryEffects::Write>(effects, getAccVar());
-}
-
// ReductionOp: var read, accVar result write.
void acc::ReductionOp::getEffects(
SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
@@ -1573,65 +1539,6 @@ struct RemoveConstantIfConditionWithRegion : public OpRewritePattern<OpTy> {
}
};
-/// Remove empty acc.kernel_environment operations. If the operation has wait
-/// operands, create a acc.wait operation to preserve synchronization.
-struct RemoveEmptyKernelEnvironment
- : public OpRewritePattern<acc::KernelEnvironmentOp> {
- using OpRewritePattern<acc::KernelEnvironmentOp>::OpRewritePattern;
-
- LogicalResult matchAndRewrite(acc::KernelEnvironmentOp op,
- PatternRewriter &rewriter) const override {
- assert(op->getNumRegions() == 1 && "expected op to have one region");
-
- Block &block = op.getRegion().front();
- if (!block.empty())
- return failure();
-
- // Conservatively disable canonicalization of empty acc.kernel_environment
- // operations if the wait operands in the kernel_environment cannot be fully
- // represented by acc.wait operation.
-
- // Disable canonicalization if device type is not the default
- if (auto deviceTypeAttr = op.getWaitOperandsDeviceTypeAttr()) {
- for (auto attr : deviceTypeAttr) {
- if (auto dtAttr = mlir::dyn_cast<acc::DeviceTypeAttr>(attr)) {
- if (dtAttr.getValue() != mlir::acc::DeviceType::None)
- return failure();
- }
- }
- }
-
- // Disable canonicalization if any wait segment has a devnum
- if (auto hasDevnumAttr = op.getHasWaitDevnumAttr()) {
- for (auto attr : hasDevnumAttr) {
- if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(attr)) {
- if (boolAttr.getValue())
- return failure();
- }
- }
- }
-
- // Disable canonicalization if there are multiple wait segments
- if (auto segmentsAttr = op.getWaitOperandsSegmentsAttr()) {
- if (segmentsAttr.size() > 1)
- return failure();
- }
-
- // Remove empty kernel environment.
- // Preserve synchronization by creating acc.wait operation if needed.
- if (!op.getWaitOperands().empty() || op.getWaitOnlyAttr())
- rewriter.replaceOpWithNewOp<acc::WaitOp>(op, op.getWaitOperands(),
- /*asyncOperand=*/Value(),
- /*waitDevnum=*/Value(),
- /*async=*/nullptr,
- /*ifCond=*/Value());
- else
- rewriter.eraseOp(op);
-
- return success();
- }
-};
-
//===----------------------------------------------------------------------===//
// Recipe Region Helpers
//===----------------------------------------------------------------------===//
@@ -3221,15 +3128,6 @@ void acc::HostDataOp::getCanonicalizationPatterns(RewritePatternSet &results,
results.add<RemoveConstantIfConditionWithRegion<HostDataOp>>(context);
}
-//===----------------------------------------------------------------------===//
-// KernelEnvironmentOp
-//===----------------------------------------------------------------------===//
-
-void acc::KernelEnvironmentOp::getCanonicalizationPatterns(
- RewritePatternSet &results, MLIRContext *context) {
- results.add<RemoveEmptyKernelEnvironment>(context);
-}
-
//===----------------------------------------------------------------------===//
// LoopOp
//===----------------------------------------------------------------------===//
@@ -5129,23 +5027,6 @@ LogicalResult acc::WaitOp::verify() {
return success();
}
-//===----------------------------------------------------------------------===//
-// ReductionCombineOp
-//===----------------------------------------------------------------------===//
-void acc::ReductionCombineOp::getEffects(
- llvm::SmallVectorImpl<
- mlir::SideEffects::EffectInstance<mlir::MemoryEffects::Effect>>
- &effects) {
- effects.emplace_back(mlir::MemoryEffects::Read::get(), &getSrcMemrefMutable(),
- mlir::SideEffects::DefaultResource::get());
- effects.emplace_back(mlir::MemoryEffects::Read::get(),
- &getDestMemrefMutable(),
- mlir::SideEffects::DefaultResource::get());
- effects.emplace_back(mlir::MemoryEffects::Write::get(),
- &getDestMemrefMutable(),
- mlir::SideEffects::DefaultResource::get());
-}
-
#define GET_OP_CLASSES
#include "mlir/Dialect/OpenACC/OpenACCOps.cpp.inc"
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
new file mode 100644
index 0000000000000..2753750128699
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -0,0 +1,186 @@
+//===- OpenACCCG.cpp - OpenACC codegen ops, attributes, and types ---------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation for OpenACC codegen operations, attributes, and types.
+// These correspond to the definitions in OpenACCCG*.td tablegen files
+// and are kept in a separate file because they do not represent direct mappings
+// of OpenACC language constructs; they are intermediate representations used
+// when decomposing and lowering primary `acc` dialect operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Region.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace mlir;
+using namespace acc;
+
+namespace {
+
+/// Generic helper for single-region OpenACC ops that execute their body once
+/// and then return to the parent operation with their results (if any).
+static void
+getSingleRegionOpSuccessorRegions(Operation *op, Region &region,
+ RegionBranchPoint point,
+ SmallVectorImpl<RegionSuccessor> &regions) {
+ if (point.isParent()) {
+ regions.push_back(RegionSuccessor(&region));
+ return;
+ }
+ regions.push_back(RegionSuccessor::parent());
+}
+
+static ValueRange getSingleRegionSuccessorInputs(Operation *op,
+ RegionSuccessor successor) {
+ return successor.isParent() ? ValueRange(op->getResults()) : ValueRange();
+}
+
+/// Remove empty acc.kernel_environment operations. If the operation has wait
+/// operands, create an acc.wait operation to preserve synchronization.
+struct RemoveEmptyKernelEnvironment
+ : public OpRewritePattern<acc::KernelEnvironmentOp> {
+ using OpRewritePattern<acc::KernelEnvironmentOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(acc::KernelEnvironmentOp op,
+ PatternRewriter &rewriter) const override {
+ assert(op->getNumRegions() == 1 && "expected op to have one region");
+
+ Block &block = op.getRegion().front();
+ if (!block.empty())
+ return failure();
+
+ // Conservatively disable canonicalization of empty acc.kernel_environment
+ // operations if the wait operands in the kernel_environment cannot be fully
+ // represented by an acc.wait operation.
+
+ // Disable canonicalization if device type is not the default
+ if (auto deviceTypeAttr = op.getWaitOperandsDeviceTypeAttr()) {
+ for (auto attr : deviceTypeAttr) {
+ if (auto dtAttr = mlir::dyn_cast<acc::DeviceTypeAttr>(attr)) {
+ if (dtAttr.getValue() != mlir::acc::DeviceType::None)
+ return failure();
+ }
+ }
+ }
+
+ // Disable canonicalization if any wait segment has a devnum
+ if (auto hasDevnumAttr = op.getHasWaitDevnumAttr()) {
+ for (auto attr : hasDevnumAttr) {
+ if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(attr)) {
+ if (boolAttr.getValue())
+ return failure();
+ }
+ }
+ }
+
+ // Disable canonicalization if there are multiple wait segments
+ if (auto segmentsAttr = op.getWaitOperandsSegmentsAttr()) {
+ if (segmentsAttr.size() > 1)
+ return failure();
+ }
+
+ // Remove empty kernel environment.
+ // Preserve synchronization by creating an acc.wait operation if needed.
+ if (!op.getWaitOperands().empty() || op.getWaitOnlyAttr())
+ rewriter.replaceOpWithNewOp<acc::WaitOp>(op, op.getWaitOperands(),
+ /*asyncOperand=*/Value(),
+ /*waitDevnum=*/Value(),
+ /*async=*/nullptr,
+ /*ifCond=*/Value());
+ else
+ rewriter.eraseOp(op);
+
+ return success();
+ }
+};
+
+template <typename EffectTy>
+static void addOperandEffect(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects,
+ const MutableOperandRange &operand) {
+ for (unsigned i = 0, e = operand.size(); i < e; ++i)
+ effects.emplace_back(EffectTy::get(), &operand[i]);
+}
+
+template <typename EffectTy>
+static void addResultEffect(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects,
+ Value result) {
+ effects.emplace_back(EffectTy::get(), mlir::cast<mlir::OpResult>(result));
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// KernelEnvironmentOp
+//===----------------------------------------------------------------------===//
+
+void KernelEnvironmentOp::getSuccessorRegions(
+ RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
+ getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
+ regions);
+}
+
+ValueRange KernelEnvironmentOp::getSuccessorInputs(RegionSuccessor successor) {
+ return getSingleRegionSuccessorInputs(getOperation(), successor);
+}
+
+void KernelEnvironmentOp::getCanonicalizationPatterns(
+ RewritePatternSet &results, MLIRContext *context) {
+ results.add<RemoveEmptyKernelEnvironment>(context);
+}
+
+//===----------------------------------------------------------------------===//
+// FirstprivateMapInitialOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult FirstprivateMapInitialOp::verify() {
+ if (getDataClause() != acc::DataClause::acc_firstprivate)
+ return emitError("data clause associated with firstprivate operation must "
+ "match its intent");
+ if (!getVar())
+ return emitError("must have var operand");
+ if (!mlir::isa<mlir::acc::PointerLikeType>(getVar().getType()) &&
+ !mlir::isa<mlir::acc::MappableType>(getVar().getType()))
+ return emitError("var must be mappable or pointer-like");
+ if (mlir::isa<mlir::acc::PointerLikeType>(getVar().getType()) &&
+ getVarType() == getVar().getType())
+ return emitError("varType must capture the element type of var");
+ if (getModifiers() != acc::DataClauseModifier::none)
+ return emitError("no data clause modifiers are allowed");
+ return success();
+}
+
+void FirstprivateMapInitialOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Read::get(),
+ acc::CurrentDeviceIdResource::get());
+ addOperandEffect<MemoryEffects::Read>(effects, getVarMutable());
+ addResultEffect<MemoryEffects::Write>(effects, getAccVar());
+}
+
+//===----------------------------------------------------------------------===//
+// ReductionCombineOp
+//===----------------------------------------------------------------------===//
+
+void ReductionCombineOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Read::get(), &getSrcMemrefMutable(),
+ SideEffects::DefaultResource::get());
+ effects.emplace_back(MemoryEffects::Read::get(), &getDestMemrefMutable(),
+ SideEffects::DefaultResource::get());
+ effects.emplace_back(MemoryEffects::Write::get(), &getDestMemrefMutable(),
+ SideEffects::DefaultResource::get());
+}
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
index a9ffa9dc208a0..fb9aa4018d263 100644
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -32,6 +32,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
MLIRBufferizationTransforms
MLIRDestinationStyleOpInterface
MLIRDialectUtils
+ MLIRIndexDialect
MLIRIR
MLIRMemRefDialect
MLIRPass
diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
index 4ea832177c4f9..0b132e9109492 100644
--- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopFusion.cpp
@@ -13,15 +13,31 @@
#include "mlir/Dialect/SCF/Transforms/Passes.h"
#include "mlir/Analysis/AliasAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/PatternMatch.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+#include <optional>
+#include <tuple>
+
namespace mlir {
#define GEN_PASS_DEF_SCFPARALLELLOOPFUSION
#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
@@ -55,114 +71,670 @@ static bool equalIterationSpaces(ParallelOp firstPloop,
matchOperands(firstPloop.getStep(), secondPloop.getStep());
}
-/// Checks if the parallel loops have mixed access to the same buffers. Returns
-/// `true` if the first parallel loop writes to the same indices that the second
-/// loop reads.
-static bool haveNoReadsAfterWriteExceptSameIndex(
+/// Check if both operations are the same type of memory write op and
+/// write to the same memory location (same buffer and same indices).
+static bool opsWriteSameMemLocation(Operation *op1, Operation *op2) {
+ if (!op1 || !op2 || op1->getName() != op2->getName())
+ return false;
+ if (op1 == op2)
+ return true;
+ // support only these memory-writing ops for now
+ if (!isa<memref::StoreOp, vector::TransferWriteOp, vector::StoreOp>(op1))
+ return false;
+ bool opsAreIdentical =
+ llvm::TypeSwitch<Operation *, bool>(op1)
+ .Case([&](memref::StoreOp storeOp1) {
+ auto storeOp2 = cast<memref::StoreOp>(op2);
+ return (storeOp1.getMemRef() == storeOp2.getMemRef()) &&
+ (storeOp1.getIndices() == storeOp2.getIndices());
+ })
+ .Case([&](vector::TransferWriteOp writeOp1) {
+ auto writeOp2 = cast<vector::TransferWriteOp>(op2);
+ return (writeOp1.getBase() == writeOp2.getBase()) &&
+ (writeOp1.getIndices() == writeOp2.getIndices()) &&
+ (writeOp1.getMask() == writeOp2.getMask()) &&
+ (writeOp1.getValueToStore().getType() ==
+ writeOp2.getValueToStore().getType()) &&
+ (writeOp1.getInBounds() == writeOp2.getInBounds());
+ })
+ .Case([&](vector::StoreOp vecStoreOp1) {
+ auto vecStoreOp2 = cast<vector::StoreOp>(op2);
+ return (vecStoreOp1.getBase() == vecStoreOp2.getBase()) &&
+ (vecStoreOp1.getIndices() == vecStoreOp2.getIndices()) &&
+ (vecStoreOp1.getValueToStore().getType() ==
+ vecStoreOp2.getValueToStore().getType()) &&
+ (vecStoreOp1.getAlignment() == vecStoreOp2.getAlignment()) &&
+ (vecStoreOp1.getNontemporal() ==
+ vecStoreOp2.getNontemporal());
+ })
+ .Default([](Operation *) { return false; });
+ return opsAreIdentical;
+}
+
+/// Check if val1 (from the first parallel loop) and val2 (from the
+/// second) are equivalent, considering the mapping of induction variables from
+/// the first to the second parallel loop.
+static bool valsAreEquivalent(Value val1, Value val2,
+ const IRMapping &loopsIVsMap) {
+ if (val1 == val2 || loopsIVsMap.lookupOrDefault(val1) == val2 ||
+ loopsIVsMap.lookupOrDefault(val2) == val1)
+ return true;
+ Operation *val1DefOp = val1.getDefiningOp();
+ Operation *val2DefOp = val2.getDefiningOp();
+ if (!val1DefOp || !val2DefOp)
+ return false;
+ if (!isMemoryEffectFree(val1DefOp) || !isMemoryEffectFree(val2DefOp))
+ return false;
+ return OperationEquivalence::isEquivalentTo(
+ val1DefOp, val2DefOp,
+ [&](Value v1, Value v2) {
+ return success(loopsIVsMap.lookupOrDefault(v1) == v2 ||
+ loopsIVsMap.lookupOrDefault(v2) == v1);
+ },
+ /*markEquivalent=*/nullptr, OperationEquivalence::Flags::IgnoreLocations);
+}
+
+/// If the `expr` value is the result of an integer addition of `base` and a
+/// constant, return the constant.
+static std::optional<int64_t> getAddConstant(Value expr, Value base,
+ const IRMapping &loopsIVsMap) {
+ if (auto addOp = expr.getDefiningOp<arith::AddIOp>()) {
+ if (auto constOp = getConstantIntValue(addOp.getLhs());
+ constOp && valsAreEquivalent(addOp.getRhs(), base, loopsIVsMap))
+ return constOp.value();
+ if (auto constOp = getConstantIntValue(addOp.getRhs());
+ constOp && valsAreEquivalent(addOp.getLhs(), base, loopsIVsMap))
+ return constOp.value();
+ return std::nullopt;
+ }
+
+ if (auto addOp = expr.getDefiningOp<index::AddOp>()) {
+ if (auto constOp = getConstantIntValue(addOp.getLhs());
+ constOp && valsAreEquivalent(addOp.getRhs(), base, loopsIVsMap))
+ return constOp.value();
+ if (auto constOp = getConstantIntValue(addOp.getRhs());
+ constOp && valsAreEquivalent(addOp.getLhs(), base, loopsIVsMap))
+ return constOp.value();
+ return std::nullopt;
+ }
+
+ if (auto applyOp = expr.getDefiningOp<affine::AffineApplyOp>()) {
+ AffineMap map = applyOp.getAffineMap();
+ if (map.getNumResults() != 1 || map.getNumDims() != 1 ||
+ map.getNumSymbols() != 0)
+ return std::nullopt;
+ if (!valsAreEquivalent(applyOp.getOperand(0), base, loopsIVsMap))
+ return std::nullopt;
+ AffineExpr result = map.getResult(0);
+ auto bin = dyn_cast<AffineBinaryOpExpr>(result);
+ if (!bin || bin.getKind() != AffineExprKind::Add)
+ return std::nullopt;
+ auto lhsDim = dyn_cast<AffineDimExpr>(bin.getLHS());
+ auto rhsDim = dyn_cast<AffineDimExpr>(bin.getRHS());
+ auto lhsConst = dyn_cast<AffineConstantExpr>(bin.getLHS());
+ auto rhsConst = dyn_cast<AffineConstantExpr>(bin.getRHS());
+ if (lhsConst && rhsDim)
+ return lhsConst.getValue();
+ if (rhsConst && lhsDim)
+ return rhsConst.getValue();
+ }
+ return std::nullopt;
+}
+
+// Return true if the scalar load index may hit any element covered by a
+// vector.store/transfer_write along a single memref dimension. Supported cases:
+//
+// 1) Direct index match (with optional offset):
+// vector.transfer_write %v, %A[%i] : vector<4xf32>, memref<...>
+// %x = memref.load %A[%i] : memref<...>
+//
+// 2) Loop IV range intersects the write range:
+// vector.transfer_write %v, %A[%c0] : vector<4xf32>, memref<...>
+// scf.for %k = %c0 to %c4 step %c1 { %x = memref.load %A[%k] }
+//
+// 3) Constant index (or IV + constant) within the write range:
+// vector.transfer_write %v, %A[%c0] : vector<4xf32>, memref<...>
+// %x = memref.load %A[%c2] : memref<...>
+// %y = memref.load %A[%i + %c1] : memref<...>
+//
+// Args:
+// - loadIndex: index used by the scalar load for this dimension.
+// - offset: subview offset for the base memref dimension (if any).
+// - writeIndex: index used by the transfer_write for this dimension. Can be
+// null if the dim was dropped by a rank-reducing subview whose result is
+// written by the vector write op.
+// - extent: vector size along this dimension (number of elements written).
+// - loopsIVsMap: IV equivalence map between fused loops.
+static bool loadIndexWithinWriteRange(Value loadIndex, OpFoldResult offset,
+ Value writeIndex, int64_t extent,
+ const IRMapping &loopsIVsMap) {
+ if (extent <= 0)
+ return false;
+
+ // Extract constant loop bounds for loop IVs (e.g. from scf.for).
+ auto getConstLoopBoundsForIV =
+ [](Value index) -> std::optional<std::tuple<int64_t, int64_t, int64_t>> {
+ auto blockArg = dyn_cast<BlockArgument>(index);
+ if (!blockArg)
+ return std::nullopt;
+ auto *parentOp = blockArg.getOwner()->getParentOp();
+ auto loopLike = dyn_cast<LoopLikeOpInterface>(parentOp);
+ if (!loopLike)
+ return std::nullopt;
+ auto ranges = getConstLoopBounds(loopLike);
+ if (ranges.empty())
+ return std::nullopt;
+
+ auto ivs = loopLike.getLoopInductionVars();
+ if (!ivs)
+ return std::nullopt;
+ auto it = llvm::find(*ivs, blockArg);
+ if (it == ivs->end())
+ return std::nullopt;
+ unsigned pos = std::distance(ivs->begin(), it);
+ if (pos >= ranges.size())
+ return std::nullopt;
+ auto [lb, ub, step] = ranges[pos];
+ return std::make_tuple(lb, ub, step);
+ };
+
+ std::optional<int64_t> offsetConst = getConstantIntValue(offset);
+ std::optional<int64_t> writeConst =
+ writeIndex ? getConstantIntValue(writeIndex) : std::optional<int64_t>(0);
+ if (!writeConst && writeIndex) {
+ // Treat single-iteration IVs as constants for matching.
+ if (auto bounds = getConstLoopBoundsForIV(writeIndex)) {
+ auto [lb, ub, step] = *bounds;
+ if (step > 0 && ub == lb + step)
+ writeConst = lb;
+ }
+ }
+
+ // Check whether a loop IV is fully contained in a constant write range.
+ auto loopIVWithinRange = [](int64_t lb, int64_t ub, int64_t step,
+ int64_t rangeStart, int64_t rangeExtent) -> bool {
+ if (rangeExtent <= 0 || step <= 0)
+ return false;
+ if (ub <= lb)
+ return false;
+ int64_t rangeEnd = rangeStart + rangeExtent;
+ return lb >= rangeStart && ub <= rangeEnd;
+ };
+
+ if (offsetConst && writeConst) {
+ // Constant start of the write range; check constant load or loop IV range.
+ int64_t start = *offsetConst + *writeConst;
+ if (auto loadConst = getConstantIntValue(loadIndex))
+ return (*loadConst >= start && *loadConst < start + extent);
+ if (auto bounds = getConstLoopBoundsForIV(loadIndex)) {
+ auto [lb, ub, step] = *bounds;
+ return loopIVWithinRange(lb, ub, step, start, extent);
+ }
+ }
+
+ if (writeIndex) {
+ // Direct IV match (or IV + constant) against the write index.
+ if (offsetConst && *offsetConst == 0 &&
+ valsAreEquivalent(loadIndex, writeIndex, loopsIVsMap))
+ return true;
+ if (auto addConst = getAddConstant(loadIndex, writeIndex, loopsIVsMap)) {
+ // Match load index of the form writeIndex + C within the write extent.
+ if (offsetConst) {
+ int64_t start = *offsetConst;
+ return (*addConst >= start && *addConst < start + extent);
+ }
+ }
+ return false;
+ }
+
+ if (auto offsetVal = dyn_cast<Value>(offset)) {
+ // Exact match when extent is 1 and the load hits the offset value.
+ if (extent == 1 && valsAreEquivalent(loadIndex, offsetVal, loopsIVsMap))
+ return true;
+ }
+
+ return false;
+}
+
+/// Look up the base operand of a memory access operation.
+///
+/// Recognized ops are memref.load, memref.store, vector.transfer_read,
+/// vector.transfer_write, vector.load and vector.store. For any other
+/// operation a null Value is returned.
+static Value getBaseMemref(Operation *op) {
+  // TODO: use the common interface for memory ops once available.
+  if (auto memLoad = dyn_cast<memref::LoadOp>(op))
+    return memLoad.getMemRef();
+  if (auto memStore = dyn_cast<memref::StoreOp>(op))
+    return memStore.getMemRef();
+  if (auto xferRead = dyn_cast<vector::TransferReadOp>(op))
+    return xferRead.getBase();
+  if (auto xferWrite = dyn_cast<vector::TransferWriteOp>(op))
+    return xferWrite.getBase();
+  if (auto vecLoad = dyn_cast<vector::LoadOp>(op))
+    return vecLoad.getBase();
+  if (auto vecStore = dyn_cast<vector::StoreOp>(op))
+    return vecStore.getBase();
+  // Unsupported operation: signal "no base" with a null Value.
+  return Value();
+}
+
+/// Decide whether a scalar memref.load reads an element that lies inside the
+/// region written by a vector write (vector.transfer_write or vector.store) on
+/// the same buffer. The write may go through a unit-stride memref.subview of
+/// the loaded buffer, in which case the subview offsets and dropped dimensions
+/// are taken into account. EXAMPLE:
+///   vector.transfer_write %V, %arg[%x, %y, ..., 0] {in_bounds = [true]} :
+///     vector<4xf32>, memref<4xf32, strided<[1], offset: ?>>
+///   scf.for %iter = %c0 to %c4 step %c1 iter_args(...) -> (f32) {
+///     %0 = memref.load %arg[%x, %y, ..., %iter] : memref<1x128x16x4xf32>
+///     ...
+///   }
+///
+/// `writeBase` / `writeIndices` describe the vector write, `vecTy` is the type
+/// of the written vector, and `vectorDimForWriteDim[d]` is the vector dimension
+/// spanned along write dimension `d`, or -1 when that dimension is not
+/// vectorized. `ivsMap` is forwarded to the per-dimension index comparison.
+static bool isLoadOnWrittenVector(memref::LoadOp loadOp, Value writeBase,
+                                  ValueRange writeIndices, VectorType vecTy,
+                                  ArrayRef<int64_t> vectorDimForWriteDim,
+                                  const IRMapping &ivsMap) {
+  if (!vecTy)
+    return false;
+
+  // Resolve the buffer that is really accessed: the subview source when the
+  // write goes through a subview, the write base itself otherwise.
+  auto subView = writeBase.getDefiningOp<memref::SubViewOp>();
+  MemrefValue accessedBuffer = nullptr;
+  SmallVector<OpFoldResult> subViewOffsets;
+  llvm::SmallBitVector droppedDims;
+  if (subView) {
+    // Only unit-stride subviews are handled.
+    if (!subView.hasUnitStride())
+      return false;
+    accessedBuffer = cast<MemrefValue>(subView.getSource());
+    subViewOffsets = llvm::to_vector(subView.getMixedOffsets());
+    droppedDims = subView.getDroppedDims();
+  } else {
+    accessedBuffer = dyn_cast<MemrefValue>(writeBase);
+    if (!accessedBuffer)
+      return false;
+  }
+  const bool viaSubView = static_cast<bool>(subView);
+
+  // The load must address the very same buffer, with one index per dimension.
+  unsigned bufferRank = accessedBuffer.getType().getRank();
+  auto loadIndices = loadOp.getIndices();
+  if (loadOp.getMemref() != accessedBuffer)
+    return false;
+  if (loadIndices.size() != bufferRank)
+    return false;
+
+  // Sanity-check the ranks of the write description against the buffer.
+  unsigned writeRank = writeIndices.size();
+  if (viaSubView ? (subViewOffsets.size() != bufferRank)
+                 : (writeRank != bufferRank))
+    return false;
+  if (vectorDimForWriteDim.size() != writeRank)
+    return false;
+
+  // Without a subview every dimension has a zero offset.
+  OpFoldResult zeroOffset(
+      IntegerAttr::get(IndexType::get(loadOp.getContext()), 0));
+
+  // Walk the buffer dimensions; `nextWriteDim` tracks the matching dimension
+  // of the write, which only advances on dimensions the subview did not drop.
+  unsigned nextWriteDim = 0;
+  for (unsigned dim = 0; dim < bufferRank; ++dim) {
+    OpFoldResult offset = viaSubView ? subViewOffsets[dim] : zeroOffset;
+    Value writeIndex; // Stays null on dropped dimensions.
+    int64_t extent = 1;
+    if (!(viaSubView && droppedDims.test(dim))) {
+      int64_t vectorDim = vectorDimForWriteDim[nextWriteDim];
+      if (vectorDim >= 0) {
+        // A vectorized dimension covers as many elements as the vector has
+        // along it; dynamic sizes cannot be reasoned about.
+        extent = vecTy.getDimSize(vectorDim);
+        if (extent == ShapedType::kDynamic)
+          return false;
+      }
+      writeIndex = writeIndices[nextWriteDim];
+      ++nextWriteDim;
+    }
+    if (!loadIndexWithinWriteRange(loadIndices[dim], offset, writeIndex, extent,
+                                   ivsMap))
+      return false;
+  }
+
+  return true;
+}
+
+/// Check whether a scalar memref.load reads an element written by the given
+/// vector.transfer_write.
+///
+/// Only writes whose permutation map is a projected permutation with one
+/// result per vector dimension and one input per write index are handled. The
+/// map is inverted into a "write dimension -> vector dimension" table (-1 for
+/// non-vectorized dimensions) and handed to isLoadOnWrittenVector.
+static bool loadMatchesVectorWrite(memref::LoadOp loadOp,
+                                   vector::TransferWriteOp writeOp,
+                                   const IRMapping &ivsMap) {
+  auto vecTy = dyn_cast<VectorType>(writeOp.getVector().getType());
+  if (!vecTy)
+    return false;
+
+  unsigned writeRank = writeOp.getIndices().size();
+  AffineMap permMap = writeOp.getPermutationMap();
+  if (!permMap.isProjectedPermutation())
+    return false;
+  if (permMap.getNumResults() != vecTy.getRank())
+    return false;
+  if (permMap.getNumDims() != writeRank)
+    return false;
+
+  // Invert the permutation map, rejecting anything that is not a plain
+  // dimension expression or that maps two vector dims onto one write dim.
+  SmallVector<int64_t> vectorDimForWriteDim(writeRank, -1);
+  for (auto [vecDim, resultExpr] : llvm::enumerate(permMap.getResults())) {
+    auto dimExpr = dyn_cast<AffineDimExpr>(resultExpr);
+    if (!dimExpr)
+      return false;
+    unsigned writeDim = dimExpr.getPosition();
+    if (writeDim >= writeRank)
+      return false;
+    int64_t &slot = vectorDimForWriteDim[writeDim];
+    if (slot != -1)
+      return false;
+    slot = vecDim;
+  }
+
+  return isLoadOnWrittenVector(loadOp, writeOp.getBase(), writeOp.getIndices(),
+                               vecTy, vectorDimForWriteDim, ivsMap);
+}
+
+/// Check whether a scalar memref.load reads an element written by the given
+/// vector.store.
+///
+/// The stored vector is mapped onto the trailing write dimensions: with a
+/// write of rank W and a vector of rank V, write dimension `W - V + i` is
+/// spanned by vector dimension `i`, and the leading dimensions are marked as
+/// non-vectorized (-1). The resulting table is handed to
+/// isLoadOnWrittenVector.
+static bool loadMatchesVectorStore(memref::LoadOp loadOp,
+                                   vector::StoreOp storeOp,
+                                   const IRMapping &ivsMap) {
+  auto vecTy = dyn_cast<VectorType>(storeOp.getValueToStore().getType());
+  if (!vecTy)
+    return false;
+
+  unsigned writeRank = storeOp.getIndices().size();
+  int64_t vectorRank = vecTy.getRank();
+  // A vector with more dimensions than the store has indices is not handled.
+  if (vectorRank > writeRank)
+    return false;
+
+  SmallVector<int64_t> vectorDimForWriteDim(writeRank, -1);
+  unsigned firstVectorizedDim = writeRank - static_cast<unsigned>(vectorRank);
+  for (unsigned writeDim = firstVectorizedDim; writeDim < writeRank; ++writeDim)
+    vectorDimForWriteDim[writeDim] = writeDim - firstVectorizedDim;
+
+  return isLoadOnWrittenVector(loadOp, storeOp.getBase(), storeOp.getIndices(),
+                               vecTy, vectorDimForWriteDim, ivsMap);
+}
+
+/// Check if both operations access the same positions of the same
+/// buffer, but one of the two does it through a rank-reducing full subview of
+/// the buffer (the other's base). EXAMPLE:
+///   memref.store %a, %buf[%c0, %i, %j] : memref<1x2x2xf32>
+///   %alias = memref.subview %buf[0, 0, 0][1, 2, 2][1, 1, 1]: memref<1x2x2xf32>
+///     to memref<2x2xf32>
+///   %val = memref.load %alias[%i, %j] : memref<2x2xf32>
+///
+/// `firstToSecondPloopIVsMap` is forwarded to valsAreEquivalent when comparing
+/// indices. `b` is passed to resolveSourceIndicesRankReducingSubview to express
+/// the subview access in terms of the source buffer.
+/// NOTE(review): that helper presumably materializes index computations with
+/// `b`; confirm whether the analysis may leave new ops behind.
+///
+/// Returns false (instead of asserting) when either op has no base known to
+/// getBaseMemref or when a base is not a memref value.
+template <typename OpTy1, typename OpTy2>
+static bool opsAccessSameIndicesViaRankReducingSubview(
+    OpTy1 op1, OpTy2 op2, const IRMapping &firstToSecondPloopIVsMap,
+    OpBuilder &b) {
+  // getBaseMemref yields a null Value for unsupported ops, and the base of a
+  // supported op is not guaranteed to be a memref. A plain cast<> would assert
+  // in both situations, so check for null first and then use dyn_cast.
+  Value rawBase1 = getBaseMemref(op1);
+  Value rawBase2 = getBaseMemref(op2);
+  if (!rawBase1 || !rawBase2)
+    return false;
+  auto base1 = dyn_cast<MemrefValue>(rawBase1);
+  auto base2 = dyn_cast<MemrefValue>(rawBase2);
+  if (!base1 || !base2)
+    return false;
+
+  // Compare an access made through `subView` with an access made directly on
+  // the subview source: resolve the subview indices to source indices and
+  // require each resolved index to be either a constant zero or equivalent to
+  // the corresponding direct index.
+  auto accessThroughTrivialSubviewIsSame =
+      [&b](memref::SubViewOp subView, ValueRange subViewAccess,
+           ValueRange sourceAccess, const IRMapping &ivsMap) -> bool {
+    SmallVector<Value> resolvedSubviewAccess;
+    LogicalResult resolved = resolveSourceIndicesRankReducingSubview(
+        subView.getLoc(), b, subView, subViewAccess, resolvedSubviewAccess);
+    if (failed(resolved) ||
+        (resolvedSubviewAccess.size() != sourceAccess.size()))
+      return false;
+    for (auto [dimIdx, resolvedIndex] :
+         llvm::enumerate(resolvedSubviewAccess)) {
+      if (!matchPattern(resolvedIndex, m_Zero()) &&
+          !valsAreEquivalent(resolvedIndex, sourceAccess[dimIdx], ivsMap))
+        return false;
+    }
+    return true;
+  };
+
+  // Case 1: op1 uses a subview of op2's base.
+  if (auto subView = base1.template getDefiningOp<memref::SubViewOp>();
+      subView &&
+      memref::isSameViewOrTrivialAlias(
+          base2, cast<MemrefValue>(subView.getSource())) &&
+      accessThroughTrivialSubviewIsSame(subView, op1.getIndices(),
+                                        op2.getIndices(),
+                                        firstToSecondPloopIVsMap))
+    return true;
+
+  // Case 2: op2 uses a subview of op1's base.
+  if (auto subView = base2.template getDefiningOp<memref::SubViewOp>();
+      subView &&
+      memref::isSameViewOrTrivialAlias(
+          base1, cast<MemrefValue>(subView.getSource())) &&
+      accessThroughTrivialSubviewIsSame(subView, op2.getIndices(),
+                                        op1.getIndices(),
+                                        firstToSecondPloopIVsMap))
+    return true;
+
+  return false;
+}
+
+/// Tell whether two memory read/write operations address the same element,
+/// i.e. whether their index lists are pairwise equivalent under `loopsIVsMap`
+/// (the mapping between the induction variables of the two parallel loops).
+///
+/// When the two ops use a different number of indices, the decision is
+/// delegated to opsAccessSameIndicesViaRankReducingSubview, which handles the
+/// case where one op goes through a rank-reducing subview of the other's base.
+template <typename OpTy1, typename OpTy2>
+static bool opsAccessSameIndices(OpTy1 op1, OpTy2 op2,
+                                 const IRMapping &loopsIVsMap, OpBuilder &b) {
+  auto firstIndices = op1.getIndices();
+  auto secondIndices = op2.getIndices();
+  const size_t numIndices = firstIndices.size();
+  if (numIndices != secondIndices.size())
+    return opsAccessSameIndicesViaRankReducingSubview(op1, op2, loopsIVsMap, b);
+
+  // Same rank: every pair of indices must be equivalent.
+  for (size_t pos = 0; pos < numIndices; ++pos) {
+    if (!valsAreEquivalent(firstIndices[pos], secondIndices[pos], loopsIVsMap))
+      return false;
+  }
+  return true;
+}
+
+/// Decide whether `loadOp` reads exactly the memory location (same buffer
+/// positions and same access properties) that `storeOp` writes.
+///
+/// Supported pairings:
+///  - memref.load with memref.store (same indices), vector.transfer_write or
+///    vector.store (the loaded element lies within the written vector);
+///  - vector.transfer_read with vector.transfer_write (same indices, mask and
+///    in_bounds);
+///  - vector.load with vector.store (same indices and alignment).
+/// Anything else, including null operations, yields false.
+static bool
+loadsFromSameMemoryLocationWrittenBy(Operation *loadOp, Operation *storeOp,
+                                     const IRMapping &firstToSecondPloopIVsMap,
+                                     OpBuilder &b) {
+  if (!loadOp || !storeOp)
+    return false;
+
+  // Scalar load: may match a scalar store or an element of a vector write.
+  if (auto memLoadOp = dyn_cast<memref::LoadOp>(loadOp)) {
+    if (auto memStoreOp = dyn_cast<memref::StoreOp>(storeOp))
+      return opsAccessSameIndices(memLoadOp, memStoreOp,
+                                  firstToSecondPloopIVsMap, b);
+    if (auto vecWriteOp = dyn_cast<vector::TransferWriteOp>(storeOp))
+      return loadMatchesVectorWrite(memLoadOp, vecWriteOp,
+                                    firstToSecondPloopIVsMap);
+    if (auto vecStoreOp = dyn_cast<vector::StoreOp>(storeOp))
+      return loadMatchesVectorStore(memLoadOp, vecStoreOp,
+                                    firstToSecondPloopIVsMap);
+    return false;
+  }
+
+  // Transfer read: only matches a transfer write with identical properties.
+  if (auto vecReadOp = dyn_cast<vector::TransferReadOp>(loadOp)) {
+    auto vecWriteOp = dyn_cast<vector::TransferWriteOp>(storeOp);
+    if (!vecWriteOp)
+      return false;
+    if (!opsAccessSameIndices(vecReadOp, vecWriteOp, firstToSecondPloopIVsMap,
+                              b))
+      return false;
+    if (vecReadOp.getMask() != vecWriteOp.getMask())
+      return false;
+    return vecReadOp.getInBounds() == vecWriteOp.getInBounds();
+  }
+
+  // Vector load: only matches a vector store with the same alignment.
+  if (auto vecLoadOp = dyn_cast<vector::LoadOp>(loadOp)) {
+    auto vecStoreOp = dyn_cast<vector::StoreOp>(storeOp);
+    if (!vecStoreOp)
+      return false;
+    if (!opsAccessSameIndices(vecLoadOp, vecStoreOp, firstToSecondPloopIVsMap,
+                              b))
+      return false;
+    return vecLoadOp.getAlignment() == vecStoreOp.getAlignment();
+  }
+
+  // Only the memory-reading ops above are supported for now.
+  return false;
+}
+
+/// Return the buffer operand written by a supported memory-writing operation
+/// (memref.store, vector.transfer_write or vector.store), or a null Value for
+/// any other operation.
+static Value getStoreOpTargetBuffer(Operation *op) {
+  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
+    return storeOp.getMemRef();
+  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op))
+    return writeOp.getBase();
+  if (auto vecStoreOp = dyn_cast<vector::StoreOp>(op))
+    return vecStoreOp.getBase();
+  return Value();
+}
+
+/// To be called when `mayAlias(val1, val2)` is true. Try to resolve the
+/// potential aliasing between `loadOp` and `storeOp` by looking at their
+/// access patterns.
+///
+/// Only a scalar memref.load paired with a vector.transfer_write or a
+/// vector.store can be resolved, by checking that the loaded element lies
+/// within the written vector. Every other combination returns false.
+static bool canResolveAlias(Operation *loadOp, Operation *storeOp,
+                            const IRMapping &loopsIVsMap) {
+  if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(storeOp)) {
+    auto scalarLoad = dyn_cast<memref::LoadOp>(loadOp);
+    return scalarLoad &&
+           loadMatchesVectorWrite(scalarLoad, transferWrite, loopsIVsMap);
+  }
+  if (auto vectorStore = dyn_cast<vector::StoreOp>(storeOp)) {
+    auto scalarLoad = dyn_cast<memref::LoadOp>(loadOp);
+    return scalarLoad &&
+           loadMatchesVectorStore(scalarLoad, vectorStore, loopsIVsMap);
+  }
+  return false;
+}
+
+/// Check that the parallel loops have no mixed access to the same buffers.
+/// Return `true` if the second parallel loop does not read or write the buffers
+/// written by the first loop using different indices.
+static bool haveNoDataDependenciesExceptSameIndex(
ParallelOp firstPloop, ParallelOp secondPloop,
const IRMapping &firstToSecondPloopIndices,
- llvm::function_ref<bool(Value, Value)> mayAlias) {
- DenseMap<Value, SmallVector<ValueRange, 1>> bufferStores;
- SmallVector<Value> bufferStoresVec;
- firstPloop.getBody()->walk([&](memref::StoreOp store) {
- bufferStores[store.getMemRef()].push_back(store.getIndices());
- bufferStoresVec.emplace_back(store.getMemRef());
- });
- auto walkResult = secondPloop.getBody()->walk([&](memref::LoadOp load) {
- Value loadMem = load.getMemRef();
- // Stop if the memref is defined in secondPloop body. Careful alias analysis
- // is needed.
- auto *memrefDef = loadMem.getDefiningOp();
- if (memrefDef && memrefDef->getBlock() == load->getBlock())
+ llvm::function_ref<bool(Value, Value)> mayAlias, OpBuilder &b) {
+ // Map buffers to their store/write ops in the firstPloop
+ DenseMap<Value, SmallVector<Operation *>> bufferStoresInFirstPloop;
+ // Record all the memory buffers used in store/write ops found in firstPloop
+ llvm::SmallSetVector<Value, 4> buffersWrittenInFirstPloop;
+
+ auto collectStoreOpsInWalk = [&](Operation *op) {
+ auto memOpInterf = dyn_cast_if_present<MemoryEffectOpInterface>(op);
+ // Ignore ops that don't write to memory
+ if (!memOpInterf || (!memOpInterf.hasEffect<MemoryEffects::Write>() &&
+ !memOpInterf.hasEffect<MemoryEffects::Free>()))
+ return WalkResult::advance();
+
+ // Only these memory-writing ops are supported for now:
+ // memref.store, vector.transfer_write, vector.store
+ Value storeOpBase = getStoreOpTargetBuffer(op);
+ if (!storeOpBase)
return WalkResult::interrupt();
- for (Value store : bufferStoresVec)
- if (store != loadMem && mayAlias(store, loadMem))
- return WalkResult::interrupt();
+ // Expect the base operand to be a Memref
+ MemrefValue storeOpBaseMemref = dyn_cast<MemrefValue>(storeOpBase);
+ if (!storeOpBaseMemref)
+ return WalkResult::interrupt();
+ // Get the original memref buffer, skipping full view-like ops
+ Value buffer = memref::skipFullyAliasingOperations(storeOpBaseMemref);
+ bufferStoresInFirstPloop[buffer].push_back(op);
+ buffersWrittenInFirstPloop.insert(buffer);
+ return WalkResult::advance();
+ };
- auto write = bufferStores.find(loadMem);
- if (write == bufferStores.end())
- return WalkResult::advance();
+ // Walk the first parallel loop to collect all store/write ops and their
+ // target buffers
+ if (firstPloop.getBody()->walk(collectStoreOpsInWalk).wasInterrupted())
+ return false;
- // Check that at last one store was retrieved
- if (write->second.empty())
+ // Check that this load/read op encountered while walking the second parallel
+ // loop does not have incompatible data dependencies with the store/write ops
+ // collected from the first parallel loop: the loops can be fused only if in
+ // the 2nd loop there are no loads/stores from/to the buffers written in the
+ // 1st loop, except when on the same exact memory location (same indices) as
+ // written in the 1st loop.
+ auto checkLoadInWalkHasNoIncompatibleDataDeps = [&](Operation *loadOp) {
+ auto memOpInterf = dyn_cast_if_present<MemoryEffectOpInterface>(loadOp);
+ // To be conservative, we should stop on ops that don't advertise their
+ // memory effects. However, many ops don't implement MemoryEffectOpInterface
+ // yet, so for now we just skip them.
+ // TODO: once more ops add MemoryEffectOpInterface, interrupt the walk here.
+ if (!memOpInterf &&
+ !loadOp->hasTrait<mlir::OpTrait::HasRecursiveMemoryEffects>())
+ return WalkResult::advance();
+ // Ignore ops that don't read from memory, and wrapping ops that have nested
+ // memory effects (e.g. loops, conditionals) as they will be analyzed when
+ // visiting their nested ops.
+ if ((!memOpInterf &&
+ loadOp->hasTrait<mlir::OpTrait::HasRecursiveMemoryEffects>()) ||
+ (memOpInterf && !memOpInterf.hasEffect<MemoryEffects::Read>()))
+ return WalkResult::advance();
+ // Support only these memory-reading ops for now
+ if (!isa<memref::LoadOp, vector::TransferReadOp, vector::LoadOp>(loadOp) ||
+ !isa<MemrefValue>(loadOp->getOperand(0)))
return WalkResult::interrupt();
- auto storeIndices = write->second.front();
+ MemrefValue loadOpBase = cast<MemrefValue>(loadOp->getOperand(0));
+ MemrefValue loadedOrigBuf = memref::skipFullyAliasingOperations(loadOpBase);
- // Multiple writes to the same memref are allowed only on the same indices
- for (const auto &othStoreIndices : write->second) {
- if (othStoreIndices != storeIndices)
+ for (Value storedMem : buffersWrittenInFirstPloop)
+ if ((storedMem != loadedOrigBuf) && mayAlias(storedMem, loadedOrigBuf) &&
+ !llvm::all_of(bufferStoresInFirstPloop[storedMem],
+ [&](Operation *storeOp) {
+ return canResolveAlias(loadOp, storeOp,
+ firstToSecondPloopIndices);
+ })) {
return WalkResult::interrupt();
+ }
+
+ auto writeOpsIt = bufferStoresInFirstPloop.find(loadedOrigBuf);
+ if (writeOpsIt == bufferStoresInFirstPloop.end())
+ return WalkResult::advance();
+ // Store/write ops to this buffer in the firstPloop
+ SmallVector<mlir::Operation *> &writeOps = writeOpsIt->second;
+
+ // If the first loop has no writes to this buffer, continue
+ if (writeOps.empty())
+ return WalkResult::advance();
+
+ Operation *writeOp = writeOps.front();
+
+ // In the first parallel loop, multiple writes to the same memref are
+ // allowed only on the same memory location
+ if (!llvm::all_of(writeOps, [&](Operation *otherWriteOp) {
+ return opsWriteSameMemLocation(writeOp, otherWriteOp);
+ })) {
+ return WalkResult::interrupt();
}
- // Check that the load indices of secondPloop coincide with store indices of
- // firstPloop for the same memrefs.
- auto loadIndices = load.getIndices();
- if (storeIndices.size() != loadIndices.size())
+ // Check that the load in secondPloop reads from the same memory location as
+ // written by the corresponding store in firstPloop
+ if (!loadsFromSameMemoryLocationWrittenBy(loadOp, writeOp,
+ firstToSecondPloopIndices, b)) {
return WalkResult::interrupt();
- for (int i = 0, e = storeIndices.size(); i < e; ++i) {
- if (firstToSecondPloopIndices.lookupOrDefault(storeIndices[i]) !=
- loadIndices[i]) {
- auto *storeIndexDefOp = storeIndices[i].getDefiningOp();
- auto *loadIndexDefOp = loadIndices[i].getDefiningOp();
- if (storeIndexDefOp && loadIndexDefOp) {
- if (!isMemoryEffectFree(storeIndexDefOp))
- return WalkResult::interrupt();
- if (!isMemoryEffectFree(loadIndexDefOp))
- return WalkResult::interrupt();
- if (!OperationEquivalence::isEquivalentTo(
- storeIndexDefOp, loadIndexDefOp,
- [&](Value storeIndex, Value loadIndex) {
- if (firstToSecondPloopIndices.lookupOrDefault(storeIndex) !=
- firstToSecondPloopIndices.lookupOrDefault(loadIndex))
- return failure();
- else
- return success();
- },
- /*markEquivalent=*/nullptr,
- OperationEquivalence::Flags::IgnoreLocations)) {
- return WalkResult::interrupt();
- }
- } else {
- return WalkResult::interrupt();
- }
- }
}
+
return WalkResult::advance();
- });
- return !walkResult.wasInterrupted();
+ };
+
+ // Walk the second parallel loop to check load/read ops against the stores
+ // collected from the first parallel loop.
+ return !secondPloop.getBody()
+ ->walk(checkLoadInWalkHasNoIncompatibleDataDeps)
+ .wasInterrupted();
}
-/// Analyzes dependencies in the most primitive way by checking simple read and
-/// write patterns.
-static LogicalResult
-verifyDependencies(ParallelOp firstPloop, ParallelOp secondPloop,
- const IRMapping &firstToSecondPloopIndices,
- llvm::function_ref<bool(Value, Value)> mayAlias) {
- if (!haveNoReadsAfterWriteExceptSameIndex(
- firstPloop, secondPloop, firstToSecondPloopIndices, mayAlias))
- return failure();
+/// Check that in each loop there are no read ops on the buffers written
+/// by the other loop, except when reading from the same exact memory location
+/// (same indices) as written in the other loop.
+static bool
+noIncompatibleDataDependencies(ParallelOp firstPloop, ParallelOp secondPloop,
+ const IRMapping &firstToSecondPloopIndices,
+ llvm::function_ref<bool(Value, Value)> mayAlias,
+ OpBuilder &b) {
+ if (!haveNoDataDependenciesExceptSameIndex(
+ firstPloop, secondPloop, firstToSecondPloopIndices, mayAlias, b))
+ return false;
IRMapping secondToFirstPloopIndices;
secondToFirstPloopIndices.map(secondPloop.getBody()->getArguments(),
firstPloop.getBody()->getArguments());
- return success(haveNoReadsAfterWriteExceptSameIndex(
- secondPloop, firstPloop, secondToFirstPloopIndices, mayAlias));
+ return haveNoDataDependenciesExceptSameIndex(
+ secondPloop, firstPloop, secondToFirstPloopIndices, mayAlias, b);
}
+/// Check if fusion of the two parallel loops is legal:
+/// i.e. no nested parallel loops, equal iteration spaces,
+/// and no incompatible data dependencies between the loops.
static bool isFusionLegal(ParallelOp firstPloop, ParallelOp secondPloop,
const IRMapping &firstToSecondPloopIndices,
- llvm::function_ref<bool(Value, Value)> mayAlias) {
+ llvm::function_ref<bool(Value, Value)> mayAlias,
+ OpBuilder &b) {
return !hasNestedParallelOp(firstPloop) &&
!hasNestedParallelOp(secondPloop) &&
equalIterationSpaces(firstPloop, secondPloop) &&
- succeeded(verifyDependencies(firstPloop, secondPloop,
- firstToSecondPloopIndices, mayAlias));
+ noIncompatibleDataDependencies(firstPloop, secondPloop,
+ firstToSecondPloopIndices, mayAlias, b);
}
-/// Prepends operations of firstPloop's body into secondPloop's body.
-/// Updates secondPloop with new loop.
+/// Prepend operations of firstPloop's body into secondPloop's body.
+/// Update secondPloop with new loop.
static void fuseIfLegal(ParallelOp firstPloop, ParallelOp &secondPloop,
OpBuilder builder,
llvm::function_ref<bool(Value, Value)> mayAlias) {
@@ -172,7 +744,7 @@ static void fuseIfLegal(ParallelOp firstPloop, ParallelOp &secondPloop,
firstToSecondPloopIndices.map(block1->getArguments(), block2->getArguments());
if (!isFusionLegal(firstPloop, secondPloop, firstToSecondPloopIndices,
- mayAlias))
+ mayAlias, builder))
return;
DominanceInfo dom;
@@ -272,6 +844,18 @@ struct ParallelLoopFusion
auto &aa = getAnalysis<AliasAnalysis>();
auto mayAlias = [&](Value val1, Value val2) -> bool {
+ // If the memref is defined in one of the parallel loops body, careful
+ // alias analysis is needed.
+ // TODO: check if this is still needed as a separate check.
+ auto val1Def = val1.getDefiningOp();
+ auto val2Def = val2.getDefiningOp();
+ auto val1Loop =
+ val1Def ? val1Def->getParentOfType<ParallelOp>() : nullptr;
+ auto val2Loop =
+ val2Def ? val2Def->getParentOfType<ParallelOp>() : nullptr;
+ if (val1Loop != val2Loop)
+ return true;
+
return !aa.alias(val1, val2).isNo();
};
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index f8a4f057c9f0d..e795f3f0b019b 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -1560,6 +1560,25 @@ bool mlir::isPerfectlyNestedForLoops(
return true;
}
+/// Collect the constant (lower bound, upper bound, step) triple of every
+/// induction dimension of `loopOp`.
+///
+/// An empty vector is returned when the loop does not expose its bounds or
+/// steps, or as soon as any bound or step is not a constant integer.
+llvm::SmallVector<std::tuple<int64_t, int64_t, int64_t>>
+mlir::getConstLoopBounds(mlir::LoopLikeOpInterface loopOp) {
+  std::optional<SmallVector<OpFoldResult>> lowerBounds =
+      loopOp.getLoopLowerBounds();
+  std::optional<SmallVector<OpFoldResult>> upperBounds =
+      loopOp.getLoopUpperBounds();
+  std::optional<SmallVector<OpFoldResult>> loopSteps = loopOp.getLoopSteps();
+  if (!(lowerBounds && upperBounds && loopSteps))
+    return {};
+
+  llvm::SmallVector<std::tuple<int64_t, int64_t, int64_t>> constRanges;
+  for (auto [lower, upper, stride] :
+       llvm::zip(*lowerBounds, *upperBounds, *loopSteps)) {
+    std::optional<int64_t> lowerCst = getConstantIntValue(lower);
+    std::optional<int64_t> upperCst = getConstantIntValue(upper);
+    std::optional<int64_t> strideCst = getConstantIntValue(stride);
+    // All-or-nothing: one non-constant value discards the partial result.
+    if (!(lowerCst && upperCst && strideCst))
+      return {};
+    constRanges.emplace_back(*lowerCst, *upperCst, *strideCst);
+  }
+  return constRanges;
+}
+
llvm::SmallVector<llvm::APInt>
mlir::getConstLoopTripCounts(mlir::LoopLikeOpInterface loopOp) {
std::optional<SmallVector<OpFoldResult>> loBnds = loopOp.getLoopLowerBounds();
diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
index b8e001c9f6950..608abffd8bd82 100644
--- a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
+++ b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
@@ -257,6 +257,27 @@ func.func @round_vector(%x: vector<4xf32>) -> vector<4xf32> {
return %0: vector<4xf32>
}
+// Unit dimensional vectors are converted to scalars by inserting
+// unrealized_conversion_cast's.
+//
+// CHECK-LABEL: @round_vector_unit_dim
+// CHECK-SAME: (%[[ARG:.+]]: vector<1xf32>) -> vector<1xf32>
+func.func @round_vector_unit_dim(%x: vector<1xf32>) -> vector<1xf32> {
+ // CHECK: %[[CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG]] : vector<1xf32> to f32
+ // CHECK: %[[ZERO:.+]] = spirv.Constant 0.000000e+00
+ // CHECK: %[[ONE:.+]] = spirv.Constant 1.000000e+00
+ // CHECK: %[[HALF:.+]] = spirv.Constant 5.000000e-01
+ // CHECK: %[[ABS:.+]] = spirv.GL.FAbs %[[CAST]] : f32
+ // CHECK: %[[FLOOR:.+]] = spirv.GL.Floor %[[ABS]]
+ // CHECK: %[[SUB:.+]] = spirv.FSub %[[ABS]], %[[FLOOR]]
+ // CHECK: %[[GE:.+]] = spirv.FOrdGreaterThanEqual %[[SUB]], %[[HALF]]
+ // CHECK: %[[SEL:.+]] = spirv.Select %[[GE]], %[[ONE]], %[[ZERO]]
+ // CHECK: %[[ADD:.+]] = spirv.FAdd %[[FLOOR]], %[[SEL]]
+ // CHECK: %[[BITCAST:.+]] = spirv.Bitcast %[[ADD]] : f32 to i32
+ %0 = math.round %x : vector<1xf32>
+ return %0: vector<1xf32>
+}
+
} // end module
// -----
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index 49b6342aea538..b736cde7689ed 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -2065,3 +2065,10 @@ llvm.func @invalid_sincos_gt_2_element_struct_return_type(%f: f32) -> () {
// expected-error at +1 {{op expected result type to be an homogeneous struct with two elements matching the operand type}}
llvm.intr.sincos(%f) : (f32) -> !llvm.struct<(f32, f32, f32)>
}
+
+// -----
+
+module {
+ // expected-error at +1 {{'llvm.blockaddress' op expects an existing block label target in the referenced function}}
+ %0 = llvm.blockaddress <function = @missing_func, tag = <id = 1>> : !llvm.ptr
+}
diff --git a/mlir/test/Dialect/SCF/parallel-loop-fusion.mlir b/mlir/test/Dialect/SCF/parallel-loop-fusion.mlir
index 0d4ea6f20e8d9..d876062b704f2 100644
--- a/mlir/test/Dialect/SCF/parallel-loop-fusion.mlir
+++ b/mlir/test/Dialect/SCF/parallel-loop-fusion.mlir
@@ -314,23 +314,24 @@ func.func @do_not_fuse_unmatching_read_write_patterns(
// -----
-func.func @do_not_fuse_loops_with_memref_defined_in_loop_bodies() {
+func.func @do_not_fuse_loops_with_nonfull_alias_defined_in_loop_bodies() {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
+ %c1fp = arith.constant 1.0 : f32
%buffer = memref.alloc() : memref<2x2xf32>
- scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+ scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c1) step (%c1, %c1) {
+ memref.store %c1fp, %buffer[%i, %j] : memref<2x2xf32>
scf.reduce
}
- scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
- %A = memref.subview %buffer[%c0, %c0][%c2, %c2][%c1, %c1]
- : memref<2x2xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>>
- %A_elem = memref.load %A[%i, %j] : memref<?x?xf32, strided<[?, ?], offset: ?>>
+ scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c1) step (%c1, %c1) {
+ %A = memref.subview %buffer[%i, %c0][2, 1][1, 1] : memref<2x2xf32> to memref<2x1xf32, strided<[2, 1], offset: ?>>
+ %A_elem = memref.load %A[%i, %j] : memref<2x1xf32, strided<[2, 1], offset: ?>>
scf.reduce
}
return
}
-// CHECK-LABEL: func @do_not_fuse_loops_with_memref_defined_in_loop_bodies
+// CHECK-LABEL: func @do_not_fuse_loops_with_nonfull_alias_defined_in_loop_bodies
// CHECK: scf.parallel
// CHECK: scf.parallel
@@ -604,6 +605,415 @@ func.func @do_not_fuse_affine_apply_to_non_ind_var(
// -----
+func.func @fuse_trivial_rank_reducing_subview() {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c1fp = arith.constant 1.0 : f32
+ %buf = memref.alloc() : memref<1x2x2xf32>
+ scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+ memref.store %c1fp, %buf[%c0, %i, %j] : memref<1x2x2xf32>
+ scf.reduce
+ }
+ %sub = memref.subview %buf[0, 0, 0][1, 2, 2][1, 1, 1]
+ : memref<1x2x2xf32> to memref<2x2xf32>
+ scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+ %v = memref.load %sub[%i, %j] : memref<2x2xf32>
+ memref.store %v, %buf[%c0, %i, %j] : memref<1x2x2xf32>
+ scf.reduce
+ }
+ memref.dealloc %buf : memref<1x2x2xf32>
+ return
+}
+// CHECK-LABEL: func @fuse_trivial_rank_reducing_subview
+// CHECK: %[[BUF:.*]] = memref.alloc() : memref<1x2x2xf32>
+// CHECK: %[[SUB:.*]] = memref.subview %[[BUF]]
+// CHECK: scf.parallel
+// CHECK: memref.store {{.*}}, %[[BUF]]
+// CHECK: %[[L:.*]] = memref.load %[[SUB]]
+// CHECK: memref.store %[[L]], %[[BUF]]
+// CHECK-NOT: scf.parallel
+// CHECK: memref.dealloc %[[BUF]] : memref<1x2x2xf32>
+
+// -----
+
+func.func @do_not_fuse_nontrivial_subview_offset() {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c1fp = arith.constant 1.0 : f32
+ %buf = memref.alloc() : memref<2x2x2xf32>
+ scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+ memref.store %c1fp, %buf[%c0, %i, %j] : memref<2x2x2xf32>
+ scf.reduce
+ }
+ %sub = memref.subview %buf[1, 0, 0][1, 2, 2][1, 1, 1]
+ : memref<2x2x2xf32> to memref<2x2xf32, strided<[2, 1], offset: 4>>
+ scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) {
+ %v = memref.load %sub[%i, %j]
+ : memref<2x2xf32, strided<[2, 1], offset: 4>>
+ memref.store %v, %buf[%c0, %i, %j] : memref<2x2x2xf32>
+ scf.reduce
+ }
+ memref.dealloc %buf : memref<2x2x2xf32>
+ return
+}
+// CHECK-LABEL: func @do_not_fuse_nontrivial_subview_offset
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+func.func @fuse_vector_load_store(%A: memref<4x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %vec0 = arith.constant dense<0.0> : vector<4xf32>
+ scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+ vector.store %vec0, %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+ %v = vector.load %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+ vector.store %v, %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @fuse_vector_load_store
+// CHECK: scf.parallel (%[[I:.*]]) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
+// CHECK: vector.store
+// CHECK: %[[V:.*]] = vector.load
+// CHECK: vector.store %[[V]]
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_different_indices(%A: memref<4x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %vec0 = arith.constant dense<0.0> : vector<4xf32>
+ scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+ vector.store %vec0, %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+ %j = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
+ %v = vector.load %A[%j, %c0] : memref<4x4xf32>, vector<4xf32>
+ vector.store %v, %A[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_different_indices
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_same_indices(%A: memref<4x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %zero = arith.constant 0.0 : f32
+ scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+ %v = vector.transfer_read %A[%i, %c0], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<4x4xf32>, vector<4xf32>
+ vector.transfer_write %v, %A[%i, %c0] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<4x4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+ %v = vector.transfer_read %A[%i, %c0], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<4x4xf32>, vector<4xf32>
+ vector.transfer_write %v, %A[%i, %c0] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<4x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_same_indices
+// CHECK: scf.parallel
+// CHECK: vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK: vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_transfer_different_indices(%A: memref<4x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %zero = arith.constant 0.0 : f32
+ scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+ %v = vector.transfer_read %A[%i, %c0], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<4x4xf32>, vector<4xf32>
+ vector.transfer_write %v, %A[%i, %c0] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<4x4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
+ %j = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
+ %v = vector.transfer_read %A[%j, %c0], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<4x4xf32>, vector<4xf32>
+ vector.transfer_write %v, %A[%i, %c0] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<4x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_transfer_different_indices
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_with_subview(%A: memref<1x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %zero = arith.constant 0.0 : f32
+ %vec = arith.constant dense<1.0> : vector<4xf32>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %sub = memref.subview %A[0, 0][1, 4][1, 1] : memref<1x4xf32> to memref<4xf32>
+ vector.transfer_write %vec, %sub[%c0] {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} : vector<4xf32>, memref<4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+ %v = memref.load %A[%c0, %k] : memref<1x4xf32>
+ %n = arith.addf %v, %acc : f32
+ scf.yield %n : f32
+ }
+ memref.store %sum, %A[%c0, %c0] : memref<1x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_with_subview
+// CHECK: scf.parallel
+// CHECK: vector.transfer_write
+// CHECK: scf.for
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_transfer_nontrivial_subview(%A: memref<2x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %zero = arith.constant 0.0 : f32
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %v = vector.transfer_read %A[%c0, %i], %zero {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<2x4xf32>, vector<1xf32>
+ vector.transfer_write %v, %A[%c0, %i] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<1xf32>, memref<2x4xf32>
+ scf.reduce
+ }
+ %sub = memref.subview %A[1, 0][1, 4][1, 1] : memref<2x4xf32> to memref<4xf32, strided<[1], offset: 4>>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %v = vector.transfer_read %sub[%i], %zero {in_bounds = [true]} : memref<4xf32, strided<[1], offset: 4>>, vector<1xf32>
+ vector.transfer_write %v, %sub[%i] {in_bounds = [true]} : vector<1xf32>, memref<4xf32, strided<[1], offset: 4>>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_transfer_nontrivial_subview
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_transfer_different_masks(%A: memref<1x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %zero = arith.constant 0.0 : f32
+ %mask_true = vector.create_mask %c1 : vector<1xi1>
+ %mask_false = vector.create_mask %c0 : vector<1xi1>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %v = vector.transfer_read %A[%c0, %i], %zero, %mask_true {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<1x4xf32>, vector<1xf32>
+ vector.transfer_write %v, %A[%c0, %i], %mask_true {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<1xf32>, memref<1x4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %v = vector.transfer_read %A[%c0, %i], %zero, %mask_false {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : memref<1x4xf32>, vector<1xf32>
+ vector.transfer_write %v, %A[%c0, %i], %mask_false {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<1xf32>, memref<1x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_transfer_different_masks
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_subview_rank_reducing(%A: memref<1x4xf32>, %B: memref<1x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %zero = arith.constant 0.0 : f32
+ %vec = arith.constant dense<1.0> : vector<4xf32>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %sub = memref.subview %A[%i, %c0][1, 4][1, 1] : memref<1x4xf32> to memref<4xf32, strided<[1], offset: ?>>
+ vector.transfer_write %vec, %sub[%c0] {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} : vector<4xf32>, memref<4xf32, strided<[1], offset: ?>>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+ %v = memref.load %A[%i, %k] : memref<1x4xf32>
+ %n = arith.addf %v, %acc : f32
+ scf.yield %n : f32
+ }
+ memref.store %sum, %B[%i, %c0] : memref<1x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_subview_rank_reducing
+// CHECK: scf.parallel
+// CHECK: vector.transfer_write
+// CHECK: scf.for
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @do_not_fuse_vector_transfer_subview_offset(%A: memref<1x4xf32>, %B: memref<1x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %zero = arith.constant 0.0 : f32
+ %vec = arith.constant dense<1.0> : vector<4xf32>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %sub = memref.subview %A[%i, %c0][1, 4][1, 1] : memref<1x4xf32> to memref<4xf32, strided<[1], offset: ?>>
+ vector.transfer_write %vec, %sub[%c0] {permutation_map = affine_map<(d0) -> (d0)>, in_bounds = [true]} : vector<4xf32>, memref<4xf32, strided<[1], offset: ?>>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+ %v = memref.load %A[%i, %k] : memref<1x4xf32>
+ %n = arith.addf %v, %acc : f32
+ scf.yield %n : f32
+ }
+ // Read from an offset alias to prevent fusion.
+ %off = memref.subview %A[%i, %c1][1, 3][1, 1] : memref<1x4xf32> to memref<3xf32, strided<[1], offset: ?>>
+ %v0 = memref.load %off[%c0] : memref<3xf32, strided<[1], offset: ?>>
+ %res = arith.addf %sum, %v0 : f32
+ memref.store %res, %B[%i, %c0] : memref<1x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @do_not_fuse_vector_transfer_subview_offset
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_no_subview(%A: memref<1x4xf32>, %B: memref<1x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %zero = arith.constant 0.0 : f32
+ %vec = arith.constant dense<2.0> : vector<4xf32>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ vector.transfer_write %vec, %A[%c0, %i] {permutation_map = affine_map<(d0, d1) -> (d1)>, in_bounds = [true]} : vector<4xf32>, memref<1x4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+ %v = memref.load %A[%c0, %k] : memref<1x4xf32>
+ %n = arith.addf %v, %acc : f32
+ scf.yield %n : f32
+ }
+ memref.store %sum, %B[%c0, %c0] : memref<1x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_no_subview
+// CHECK: vector.transfer_write
+// CHECK: scf.for
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_scalar_load_rank2(%A: memref<2x4xf32>, %B: memref<2x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %vec = arith.constant dense<1.0> : vector<2x4xf32>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ vector.transfer_write %vec, %A[%c0, %c0] {permutation_map = affine_map<(d0, d1) -> (d0, d1)>, in_bounds = [true, true]} : vector<2x4xf32>, memref<2x4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %v0 = memref.load %A[%c0, %c1] : memref<2x4xf32>
+ %v1 = memref.load %A[%c1, %c2] : memref<2x4xf32>
+ %sum = arith.addf %v0, %v1 : f32
+ memref.store %sum, %B[%c0, %c0] : memref<2x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_scalar_load_rank2
+// CHECK: scf.parallel
+// CHECK: vector.transfer_write
+// CHECK: memref.load
+// CHECK: memref.load
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @fuse_vector_transfer_scalar_load_loop_rank2(%A: memref<2x4xf32>, %B: memref<2x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %zero = arith.constant 0.0 : f32
+ %vec = arith.constant dense<2.0> : vector<2x4xf32>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ vector.transfer_write %vec, %A[%c0, %c0] {permutation_map = affine_map<(d0, d1) -> (d0, d1)>, in_bounds = [true, true]} : vector<2x4xf32>, memref<2x4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %sum = scf.for %k = %c0 to %c4 step %c1 iter_args(%acc = %zero) -> f32 {
+ %v = memref.load %A[%c1, %k] : memref<2x4xf32>
+ %n = arith.addf %v, %acc : f32
+ scf.yield %n : f32
+ }
+ memref.store %sum, %B[%c0, %c0] : memref<2x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @fuse_vector_transfer_scalar_load_loop_rank2
+// CHECK: scf.parallel
+// CHECK: vector.transfer_write
+// CHECK: scf.for
+// CHECK-NOT: scf.parallel
+
+// -----
+
+func.func @fuse_vector_store_scalar_load_rank2(%A: memref<2x4xf32>, %B: memref<2x4xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %vec = arith.constant dense<3.0> : vector<2x4xf32>
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ vector.store %vec, %A[%c0, %c0] : memref<2x4xf32>, vector<2x4xf32>
+ scf.reduce
+ }
+ scf.parallel (%i) = (%c0) to (%c1) step (%c1) {
+ %v0 = memref.load %A[%c1, %c2] : memref<2x4xf32>
+ %v1 = memref.load %A[%c0, %c3] : memref<2x4xf32>
+ %sum = arith.addf %v0, %v1 : f32
+ memref.store %sum, %B[%c0, %c0] : memref<2x4xf32>
+ scf.reduce
+ }
+ return
+}
+// CHECK-LABEL: func @fuse_vector_store_scalar_load_rank2
+// CHECK: scf.parallel
+// CHECK: vector.store
+// CHECK: memref.load
+// CHECK: memref.load
+// CHECK-NOT: scf.parallel
+
+// -----
+
func.func @fuse_reductions_two(%A: memref<2x2xf32>, %B: memref<2x2xf32>) -> (f32, f32) {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
diff --git a/mlir/test/Integration/Dialect/XeGPU/LANE/no-xegpu-ops.mlir b/mlir/test/Integration/Dialect/XeGPU/LANE/no-xegpu-ops.mlir
new file mode 100644
index 0000000000000..548e1864b1a05
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeGPU/LANE/no-xegpu-ops.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt %s --gpu-async-region --gpu-lower-to-xevm-pipeline="xegpu-op-level=lane" \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+ memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
+ memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+ func.func @main() {
+ %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+ %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+ %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+ %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+ call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+ return
+ }
+ func.func private @printMemrefF32(memref<*xf32>)
+ func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ %mem = gpu.alloc () : memref<2x2x2xf32>
+ gpu.memcpy %mem, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ %memref_0 = gpu.alloc () : memref<2x2x2xf32>
+ gpu.memcpy %memref_0, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ %memref_2 = gpu.alloc () : memref<2x2x2xf32>
+ gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
+ args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
+ %alloc = memref.alloc() : memref<2x2x2xf32>
+ gpu.memcpy %alloc, %memref_2 : memref<2x2x2xf32>, memref<2x2x2xf32>
+ gpu.dealloc %memref_2 : memref<2x2x2xf32>
+ gpu.dealloc %memref_0 : memref<2x2x2xf32>
+ gpu.dealloc %mem : memref<2x2x2xf32>
+ return %alloc : memref<2x2x2xf32>
+ }
+ gpu.module @test_kernel {
+ gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel {
+ %0 = gpu.block_id x
+ %1 = gpu.block_id y
+ %2 = gpu.block_id z
+ %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+ %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+ %5 = arith.addf %3, %4 : f32
+ memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+ gpu.return
+ }
+ }
+ // CHECK: [2.3, 4.5]
+ // CHECK: [7.8, 10.2]
+ // CHECK: [12.7, 14.9]
+ // CHECK: [18.2, 20.6]
+}
diff --git a/utils/bazel/BUILD.bazel b/utils/bazel/BUILD.bazel
index dd837093c62ac..4e907cc8b7ba7 100644
--- a/utils/bazel/BUILD.bazel
+++ b/utils/bazel/BUILD.bazel
@@ -2,4 +2,10 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Required to reference .bzl files in this package
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+
+bzl_library(
+ name = "configure",
+ srcs = ["configure.bzl"],
+ visibility = ["//visibility:public"],
+)
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
index 05fcbf7beb99f..edfa65654066d 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
@@ -2,7 +2,7 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("@rules_cc//cc:defs.bzl", "cc_library")
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
package(
default_visibility = ["//visibility:public"],
@@ -15,6 +15,7 @@ cc_library(
name = "lib",
srcs = glob(["*.cpp"]),
hdrs = glob(["*.h"]),
+ includes = ["."],
deps = [
"//clang:ast",
"//clang:ast_matchers",
@@ -26,3 +27,16 @@ cc_library(
"//llvm:Support",
],
)
+
+cc_binary(
+ name = "clang-query",
+ srcs = ["tool/ClangQuery.cpp"],
+ stamp = 0,
+ deps = [
+ ":lib",
+ "//clang:frontend",
+ "//clang:tooling",
+ "//llvm:LineEditor",
+ "//llvm:Support",
+ ],
+)
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index a5b6823b9ca3d..2ecd3502204e2 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2675,6 +2675,16 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_bf16divl",
+ hdrs = ["src/__support/math/bf16divl.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_fputil_bfloat16",
+ ":__support_macros_config",
+ ],
+)
+
libc_support_library(
name = "__support_math_bf16fmaf",
hdrs = ["src/__support/math/bf16fmaf.h"],
@@ -2685,6 +2695,56 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_bf16fmal",
+ hdrs = ["src/__support/math/bf16fmal.h"],
+ deps = [
+ ":__support_fputil_bfloat16",
+ ":__support_fputil_fma",
+ ":__support_macros_config",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_bf16mul",
+ hdrs = ["src/__support/math/bf16mul.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_fputil_bfloat16",
+ ":__support_macros_config",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_bf16mulf",
+ hdrs = ["src/__support/math/bf16mulf.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_fputil_bfloat16",
+ ":__support_macros_config",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_bf16mulf128",
+ hdrs = ["src/__support/math/bf16mulf128.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_fputil_bfloat16",
+ ":__support_macros_config",
+ ":llvm_libc_types_float128",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_bf16mull",
+ hdrs = ["src/__support/math/bf16mull.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_fputil_bfloat16",
+ ],
+)
+
libc_support_library(
name = "__support_math_canonicalize",
hdrs = ["src/__support/math/canonicalize.h"],
@@ -2992,6 +3052,47 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_f16add",
+ hdrs = ["src/__support/math/f16add.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_macros_config",
+ ":llvm_libc_macros_float16_macros",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_f16addf",
+ hdrs = ["src/__support/math/f16addf.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_macros_config",
+ ":llvm_libc_macros_float16_macros",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_f16addf128",
+ hdrs = ["src/__support/math/f16addf128.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_macros_config",
+ ":llvm_libc_macros_float16_macros",
+ ":llvm_libc_types_float128",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_f16addl",
+ hdrs = ["src/__support/math/f16addl.h"],
+ deps = [
+ ":__support_fputil_basic_operations",
+ ":__support_macros_config",
+ ":llvm_libc_macros_float16_macros",
+ ],
+)
+
libc_support_library(
name = "__support_math_f16fma",
hdrs = ["src/__support/math/f16fma.h"],
@@ -3264,6 +3365,16 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_logbl",
+ hdrs = ["src/__support/math/logbl.h"],
+ deps = [
+ ":__support_common",
+ ":__support_fputil_manipulation_functions",
+ ":__support_macros_config",
+ ],
+)
+
libc_support_library(
name = "__support_math_log10",
hdrs = ["src/__support/math/log10.h"],
@@ -4087,6 +4198,24 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_tanf16",
+ hdrs = ["src/__support/math/tanf16.h"],
+ deps = [
+ ":__support_fputil_cast",
+ ":__support_fputil_except_value_utils",
+ ":__support_fputil_fenv_impl",
+ ":__support_fputil_fp_bits",
+ ":__support_fputil_multiply_add",
+ ":__support_macros_optimization",
+ ":__support_macros_properties_types",
+ ":__support_math_sincosf16_utils",
+ ":hdr_errno_macros",
+ ":hdr_fenv_macros",
+ ":llvm_libc_macros_float16_macros",
+ ],
+)
+
libc_support_library(
name = "__support_math_tanhf",
hdrs = ["src/__support/math/tanhf.h"],
@@ -4119,6 +4248,22 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_tanpif",
+ hdrs = ["src/__support/math/tanpif.h"],
+ deps = [
+ ":__support_common",
+ ":__support_fputil_cast",
+ ":__support_fputil_except_value_utils",
+ ":__support_fputil_fenv_impl",
+ ":__support_fputil_fp_bits",
+ ":__support_fputil_multiply_add",
+ ":__support_macros_config",
+ ":__support_macros_optimization",
+ ":__support_sincosf_utils",
+ ],
+)
+
############################### complex targets ################################
libc_function(
@@ -4483,11 +4628,49 @@ libc_math_function(
additional_deps = [":__support_math_bf16addf128"],
)
+libc_math_function(
+ name = "bf16divl",
+ additional_deps = [":__support_math_bf16divl"],
+)
+
libc_math_function(
name = "bf16fmaf",
additional_deps = [":__support_math_bf16fmaf"],
)
+libc_math_function(
+ name = "bf16fmal",
+ additional_deps = [":__support_math_bf16fmal"],
+)
+
+libc_math_function(
+ name = "bf16fmul",
+ additional_deps = [
+ ":__support_math_bf16mul"
+ ],
+)
+
+libc_math_function(
+ name = "bf16fmulf",
+ additional_deps = [
+ ":__support_math_bf16mulf"
+ ],
+)
+
+libc_math_function(
+ name = "bf16fmulf128",
+ additional_deps = [
+ ":__support_math_bf16mulf128"
+ ],
+)
+
+libc_math_function(
+ name = "bf16fmull",
+ additional_deps = [
+ ":__support_math_bf16mull"
+ ],
+)
+
libc_math_function(
name = "canonicalize",
additional_deps = [
@@ -4757,13 +4940,25 @@ libc_math_function(
additional_deps = [":__support_math_expm1f16"],
)
-libc_math_function(name = "f16add")
+libc_math_function(
+ name = "f16add",
+ additional_deps = [":__support_math_f16add"],
+)
-libc_math_function(name = "f16addf")
+libc_math_function(
+ name = "f16addf",
+ additional_deps = [":__support_math_f16addf"],
+)
-libc_math_function(name = "f16addf128")
+libc_math_function(
+ name = "f16addf128",
+ additional_deps = [":__support_math_f16addf128"],
+)
-libc_math_function(name = "f16addl")
+libc_math_function(
+ name = "f16addl",
+ additional_deps = [":__support_math_f16addl"],
+)
libc_math_function(name = "f16div")
@@ -5312,7 +5507,10 @@ libc_math_function(
additional_deps = [":__support_math_logbf"],
)
-libc_math_function(name = "logbl")
+libc_math_function(
+ name = "logbl",
+ additional_deps = [":__support_math_logbl"],
+)
libc_math_function(
name = "logbf128",
@@ -5651,9 +5849,7 @@ libc_math_function(
libc_math_function(
name = "tanf16",
additional_deps = [
- ":__support_fputil_nearest_integer",
- ":__support_fputil_polyeval",
- ":__support_math_sincosf16_utils",
+ ":__support_math_tanf16",
],
)
@@ -5674,11 +5870,7 @@ libc_math_function(
libc_math_function(
name = "tanpif",
additional_deps = [
- ":__support_sincosf_utils",
- ":hdr_fenv_macros",
- ":__support_macros_config",
- ":__support_macros_optimization",
- ":__support_fputil_multiply_add",
+ ":__support_math_tanpif",
],
)
diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
index 59916fe16be1d..24d2de92636d1 100644
--- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -2078,6 +2078,7 @@ cc_library(
"//llvm:BinaryFormat",
"//llvm:Object",
"//llvm:Support",
+ "//llvm:TargetParser",
],
)
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index c18f71f466ea2..3292454f49de4 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2911,8 +2911,10 @@ cc_library(
":DialectUtils",
":FuncDialect",
":IR",
+ ":IndexDialect",
":LoopLikeInterface",
":MemRefDialect",
+ ":MemRefUtils",
":Pass",
":Rewrite",
":SCFDialect",
@@ -2924,6 +2926,7 @@ cc_library(
":TensorTransforms",
":TilingInterface",
":TransformUtils",
+ ":VectorDialect",
":ViewLikeInterface",
"//llvm:Support",
],
>From cc671df1081c4e50f99eaf2c4d8cd98b7b2e3413 Mon Sep 17 00:00:00 2001
From: hulxv <hulxxv at gmail.com>
Date: Tue, 24 Feb 2026 04:14:57 +0200
Subject: [PATCH 2/2] format
---
utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index e50c7adf4628a..1695f80daa6dd 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -4790,28 +4790,28 @@ libc_math_function(
libc_math_function(
name = "bf16fmul",
additional_deps = [
- ":__support_math_bf16mul"
+ ":__support_math_bf16mul",
],
)
libc_math_function(
name = "bf16fmulf",
additional_deps = [
- ":__support_math_bf16mulf"
+ ":__support_math_bf16mulf",
],
)
libc_math_function(
name = "bf16fmulf128",
additional_deps = [
- ":__support_math_bf16mulf128"
+ ":__support_math_bf16mulf128",
],
)
libc_math_function(
name = "bf16fmull",
additional_deps = [
- ":__support_math_bf16mull"
+ ":__support_math_bf16mull",
],
)
More information about the flang-commits mailing list