[clang] 7fdf608 - [AMDGPU] Add GFX12 WMMA and SWMMAC instructions (#77795)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Jan 24 04:43:15 PST 2024
Author: Mirko BrkuĊĦanin
Date: 2024-01-24T13:43:07+01:00
New Revision: 7fdf608cefa0d9051eb3146ee19c3750e237c799
URL: https://github.com/llvm/llvm-project/commit/7fdf608cefa0d9051eb3146ee19c3750e237c799
DIFF: https://github.com/llvm/llvm-project/commit/7fdf608cefa0d9051eb3146ee19c3750e237c799.diff
LOG: [AMDGPU] Add GFX12 WMMA and SWMMAC instructions (#77795)
Co-authored-by: Petar Avramovic <Petar.Avramovic at amd.com>
Co-authored-by: Piotr Sobczak <piotr.sobczak at amd.com>
Added:
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl
cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl
cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl
cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl
cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s
llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
llvm/lib/Target/AMDGPU/SIDefines.h
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrFormats.td
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/lib/Target/AMDGPU/VOPInstructions.td
llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
mlir/test/Target/LLVMIR/rocdl.mlir
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index d208342d9c516e..74dfd1d214e849 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -436,5 +436,67 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,w
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
+//===----------------------------------------------------------------------===//
+// WMMA builtins.
+// Postfix w32 indicates the builtin requires wavefront size of 32.
+// Postfix w64 indicates the builtin requires wavefront size of 64.
+//
+// Some of these are very similar to their GFX11 counterparts, but they don't
+// require replication of the A,B matrices, so they use fewer vector elements.
+// Therefore, we add an "_gfx12" suffix to distinguish them from the existing
+// builtins.
+//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+// These are gfx12-only, but for consistency with the other WMMA variants we're
+// keeping the "_gfx12" suffix.
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+// These are gfx12-only, but for consistency with the other WMMA variants we're
+// keeping the "_gfx12" suffix.
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 7ef764b8e1ac80..a4f26a6f0eb19b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18279,65 +18279,216 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
- case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
// These operations perform a matrix multiplication and accumulation of
// the form:
// D = A * B + C
- // The return type always matches the type of matrix C.
- unsigned ArgForMatchingRetType;
+ // We need to specify one type for matrices AB and one for matrices CD.
+ // Sparse matrix operations can have
diff erent types for A and B as well as
+ // an additional type for sparsity index.
+ // Destination type should be put before types used for source operands.
+ SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
+ // On GFX12, the intrinsics with 16-bit accumulator use a packed layout.
+ // There is no need for the variable opsel argument, so always set it to
+ // "false".
+ bool AppendFalseForOpselArg = false;
unsigned BuiltinWMMAOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
- ArgForMatchingRetType = 2;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
- ArgForMatchingRetType = 2;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
+ AppendFalseForOpselArg = true;
+ LLVM_FALLTHROUGH;
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
- ArgForMatchingRetType = 2;
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
+ AppendFalseForOpselArg = true;
+ LLVM_FALLTHROUGH;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
- ArgForMatchingRetType = 2;
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
- ArgForMatchingRetType = 2;
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
- ArgForMatchingRetType = 2;
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
- ArgForMatchingRetType = 4;
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
break;
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
- ArgForMatchingRetType = 4;
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
+ ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
+ break;
}
SmallVector<Value *, 6> Args;
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
Args.push_back(EmitScalarExpr(E->getArg(i)));
+ if (AppendFalseForOpselArg)
+ Args.push_back(Builder.getFalse());
- Function *F = CGM.getIntrinsic(BuiltinWMMAOp,
- {Args[ArgForMatchingRetType]->getType()});
+ SmallVector<llvm::Type *, 6> ArgTypes;
+ for (auto ArgIdx : ArgsForMatchingMatrixTypes)
+ ArgTypes.push_back(Args[ArgIdx]->getType());
+ Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
return Builder.CreateCall(F, Args);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
new file mode 100644
index 00000000000000..11747af7ea74f5
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
@@ -0,0 +1,156 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+// Wave32
+
+//
+// amdgcn_wmma_f32_16x16x16_f16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_f32_16x16x16_bf16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_f16_16x16x16_f16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu8
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu4
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
new file mode 100644
index 00000000000000..ef32648743ca62
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl
@@ -0,0 +1,155 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef float v4f __attribute__((ext_vector_type(4)));
+typedef half v4h __attribute__((ext_vector_type(4)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+
+// Wave64
+
+//
+// amdgcn_wmma_f32_16x16x16_f16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_f32_16x16x16_bf16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_f16_16x16x16_f16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_bf16_16x16x16_bf16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu8
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false);
+}
+
+//
+// amdgcn_wmma_i32_16x16x16_iu4
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
new file mode 100644
index 00000000000000..b303c2f25dddac
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl
@@ -0,0 +1,135 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+typedef half v16h __attribute__((ext_vector_type(16)));
+typedef short v16s __attribute__((ext_vector_type(16)));
+
+// Wave32
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl
new file mode 100644
index 00000000000000..855fa7351e1556
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl
@@ -0,0 +1,134 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef float v4f __attribute__((ext_vector_type(4)));
+typedef half v4h __attribute__((ext_vector_type(4)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+
+// Wave64
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true)
+// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]])
+// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1200-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
index 41a78ae268be57..49cb797df42331 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl
@@ -16,7 +16,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
// Wave32
-void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f,
+void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f,
global v16h* out16h, v16h a16h, v16h b16h, v16h c16h,
global v16s* out16s, v2i a2i, v2i b2i, v16s c16s,
global v8i* out8i, v4i a4i, v4i b4i, v8i c8i)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
index 4f13c75e5e81f9..3c6aaf5e38281d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl
@@ -2,7 +2,6 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
-typedef float v4f __attribute__((ext_vector_type(4)));
typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef int v2i __attribute__((ext_vector_type(2)));
@@ -20,7 +19,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -35,7 +34,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -50,7 +49,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -65,7 +64,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -80,7 +79,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -95,7 +94,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -110,7 +109,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -125,7 +124,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c)
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
index 4797675f50d42e..1490f14fd17b69 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl
@@ -3,12 +3,10 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
typedef float v4f __attribute__((ext_vector_type(4)));
-typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
-typedef int v8i __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
typedef short v16s __attribute__((ext_vector_type(16)));
@@ -22,7 +20,7 @@ typedef short v16s __attribute__((ext_vector_type(16)));
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -37,7 +35,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]])
// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -52,7 +50,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -67,7 +65,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -82,7 +80,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -97,7 +95,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b,
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true)
// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -112,7 +110,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
@@ -127,7 +125,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c)
// CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64(
// CHECK-GFX1100-NEXT: entry:
-// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
+// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false)
// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
// CHECK-GFX1100-NEXT: ret void
//
diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl
new file mode 100644
index 00000000000000..867273126f8fdf
--- /dev/null
+++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+// Wave32
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w32:
+// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w32:
+// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c)
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w32:
+// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
+//
+void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w32:
+// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
+//
+void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32:
+// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
+//
+void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false);
+}
diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl
new file mode 100644
index 00000000000000..ad01450c35a76f
--- /dev/null
+++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef float v4f __attribute__((ext_vector_type(4)));
+typedef half v4h __attribute__((ext_vector_type(4)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+
+// Wave64
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w64:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w64:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w64:
+// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w64:
+// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c)
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w64:
+// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
+//
+void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w64:
+// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
+//
+void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32:
+// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+//
+void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32:
+// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0]
+//
+void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false);
+}
diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl
new file mode 100644
index 00000000000000..317d9a1102ccff
--- /dev/null
+++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl
@@ -0,0 +1,110 @@
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+typedef half v16h __attribute__((ext_vector_type(16)));
+typedef short v16s __attribute__((ext_vector_type(16)));
+
+// Wave32
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w32:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w32:
+// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
+//
+void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
+//
+void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
+//
+void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index);
+}
diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl
new file mode 100644
index 00000000000000..eb81234f53a663
--- /dev/null
+++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl
@@ -0,0 +1,109 @@
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef float v4f __attribute__((ext_vector_type(4)));
+typedef half v4h __attribute__((ext_vector_type(4)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+
+// Wave64
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w64:
+// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
+//
+void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp
+//
+void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
+//
+void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e6db9da5526aa3..9eb1ac8e27befb 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2601,6 +2601,11 @@ def int_amdgcn_ds_bvh_stack_rtn :
[ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
+def int_amdgcn_s_wait_event_export_ready :
+ ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
+ Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
+>;
+
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
@@ -2608,10 +2613,10 @@ def int_amdgcn_ds_bvh_stack_rtn :
class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
Intrinsic<
- [CD], // %D
+ [CD], // %D
[
AB, // %A
- AB, // %B
+ LLVMMatchType<1>, // %B
LLVMMatchType<0>, // %C
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
@@ -2619,49 +2624,50 @@ class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
Intrinsic<
- [CD], // %D
+ [CD], // %D
[
AB, // %A
- AB, // %B
+ LLVMMatchType<1>, // %B
LLVMMatchType<0>, // %C
- llvm_i1_ty, // %high
+ llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
Intrinsic<
- [CD], // %D
+ [CD], // %D
[
llvm_i1_ty, // %A_sign
AB, // %A
llvm_i1_ty, // %B_sign
- AB, // %B
+ LLVMMatchType<1>, // %B
LLVMMatchType<0>, // %C
llvm_i1_ty, // %clamp
],
[IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
-def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
-// The regular, untied f16/bf16 wmma intrinsics only write to one half
-// of the registers (set via the op_sel bit).
-// The content of the other 16-bit of the registers is undefined.
-def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
-// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix
-// registers to the input accumulator registers.
-// Essentially, the content of the other 16-bit is preserved from the input.
-def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
-def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
-def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
+// WMMA GFX11Only
-def int_amdgcn_s_wait_event_export_ready :
- ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
- Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
->;
+// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
+// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
+// The content of the other 16-bit half is preserved from the input.
+def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
+
+// WMMA GFX11Plus
+
+def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
+// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
+// The content of the other 16-bit half is undefined.
+// GFX12: The op_sel bit must be 0.
+def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
//===----------------------------------------------------------------------===//
// GFX12 Intrinsics
@@ -2681,6 +2687,65 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
+
+// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
+//
+// These operations perform a matrix multiplication and accumulation of
+// the form: D = A * B + C .
+
+// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
+def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+// A and B are <16 x iu4>.
+def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
+// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
+//
+// These operations perform a sparse matrix multiplication and accumulation of
+// the form: D = A * B + C.
+// A is sparse matrix, half the size of B, and is expanded using sparsity index.
+
+class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
+ Intrinsic<
+ [CD], // %D
+ [
+ A, // %A
+ B, // %B
+ LLVMMatchType<0>, // %C
+ Index // %Sparsity index for A
+ ],
+ [IntrNoMem, IntrConvergent, IntrWillReturn]
+>;
+
+class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
+ Intrinsic<
+ [CD], // %D
+ [
+ llvm_i1_ty, // %A_sign
+ A, // %A
+ llvm_i1_ty, // %B_sign
+ B, // %B
+ LLVMMatchType<0>, // %C
+ Index, // %Sparsity index for A
+ llvm_i1_ty, // %clamp
+ ],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
+>;
+
+def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+
def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index a19b03b9292337..152f495a452ba2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -59,6 +59,30 @@ def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
+def gi_wmmavisrc :
+ GIComplexOperandMatcher<s32, "selectWMMAVISrc">,
+ GIComplexPatternEquiv<WMMAVISrc>;
+
+def gi_wmmamods :
+ GIComplexOperandMatcher<s32, "selectWMMAModsF32NegAbs">,
+ GIComplexPatternEquiv<WMMAModsF32NegAbs>;
+
+def gi_wmmamodsf16Neg :
+ GIComplexOperandMatcher<s32, "selectWMMAModsF16Neg">,
+ GIComplexPatternEquiv<WMMAModsF16Neg>;
+
+def gi_wmmamodsf16NegAbs :
+ GIComplexOperandMatcher<s32, "selectWMMAModsF16NegAbs">,
+ GIComplexPatternEquiv<WMMAModsF16NegAbs>;
+
+def gi_swmmacindex8 :
+ GIComplexOperandMatcher<s32, "selectSWMMACIndex8">,
+ GIComplexPatternEquiv<SWMMACIndex8>;
+
+def gi_swmmacindex16 :
+ GIComplexOperandMatcher<s32, "selectSWMMACIndex16">,
+ GIComplexPatternEquiv<SWMMACIndex16>;
+
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 4c35649cec6c8a..4f7bf3f7d35e71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3048,6 +3048,336 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
return true;
}
+static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
+ llvm::SelectionDAG *CurDAG,
+ const SDLoc &DL) {
+ unsigned DstRegClass;
+ EVT DstTy;
+ switch (Elts.size()) {
+ case 8:
+ DstRegClass = AMDGPU::VReg_256RegClassID;
+ DstTy = MVT::v8i32;
+ break;
+ case 4:
+ DstRegClass = AMDGPU::VReg_128RegClassID;
+ DstTy = MVT::v4i32;
+ break;
+ case 2:
+ DstRegClass = AMDGPU::VReg_64RegClassID;
+ DstTy = MVT::v2i32;
+ break;
+ default:
+ llvm_unreachable("unhandled Reg sequence size");
+ }
+
+ SmallVector<SDValue, 17> Ops;
+ Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
+ for (unsigned i = 0; i < Elts.size(); ++i) {
+ Ops.push_back(Elts[i]);
+ Ops.push_back(CurDAG->getTargetConstant(
+ SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
+ }
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
+}
+
+static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
+ llvm::SelectionDAG *CurDAG,
+ const SDLoc &DL) {
+ SmallVector<SDValue, 8> PackedElts;
+ assert("unhandled Reg sequence size" &&
+ (Elts.size() == 8 || Elts.size() == 16));
+
+ // Pack 16-bit elements in pairs into 32-bit register. If both elements are
+ // unpacked from 32-bit source use it, otherwise pack them using v_perm.
+ for (unsigned i = 0; i < Elts.size(); i += 2) {
+ SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
+ SDValue HiSrc;
+ if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
+ PackedElts.push_back(HiSrc);
+ } else {
+ SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
+ MachineSDNode *Packed =
+ CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
+ {Elts[i + 1], Elts[i], PackLoLo});
+ PackedElts.push_back(SDValue(Packed, 0));
+ }
+ }
+
+ return buildRegSequence32(PackedElts, CurDAG, DL);
+}
+
+static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
+ llvm::SelectionDAG *CurDAG,
+ const SDLoc &DL, unsigned ElementSize) {
+ if (ElementSize == 16)
+ return buildRegSequence16(Elts, CurDAG, DL);
+ if (ElementSize == 32)
+ return buildRegSequence32(Elts, CurDAG, DL);
+ llvm_unreachable("Unhandled element size");
+}
+
+static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
+ SmallVectorImpl<SDValue> &Elts, SDValue &Src,
+ llvm::SelectionDAG *CurDAG, const SDLoc &DL,
+ unsigned ElementSize) {
+ if (ModOpcode == ISD::FNEG) {
+ Mods |= SISrcMods::NEG;
+ // Check if all elements also have abs modifier
+ SmallVector<SDValue, 8> NegAbsElts;
+ for (auto El : Elts) {
+ if (El.getOpcode() != ISD::FABS)
+ break;
+ NegAbsElts.push_back(El->getOperand(0));
+ }
+ if (Elts.size() != NegAbsElts.size()) {
+ // Neg
+ Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
+ } else {
+ // Neg and Abs
+ Mods |= SISrcMods::NEG_HI;
+ Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
+ }
+ } else {
+ assert(ModOpcode == ISD::FABS);
+ // Abs
+ Mods |= SISrcMods::NEG_HI;
+ Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
+ }
+}
+
+// Check all f16 elements for modifiers while looking through b32 and v2b16
+// build vector, stop if element does not satisfy ModifierCheck.
+static void
+checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
+ std::function<bool(SDValue)> ModifierCheck) {
+ for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
+ if (auto *F16Pair =
+ dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
+ for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
+ SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
+ if (!ModifierCheck(ElF16))
+ break;
+ }
+ }
+ }
+}
+
+bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ Src = In;
+ unsigned Mods = SISrcMods::OP_SEL_1;
+
+ // mods are on f16 elements
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ SmallVector<SDValue, 8> EltsF16;
+
+ checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
+ if (Element.getOpcode() != ISD::FNEG)
+ return false;
+ EltsF16.push_back(Element.getOperand(0));
+ return true;
+ });
+
+ // All elements have neg modifier
+ if (BV->getNumOperands() * 2 == EltsF16.size()) {
+ Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
+ Mods |= SISrcMods::NEG;
+ Mods |= SISrcMods::NEG_HI;
+ }
+ }
+
+ // mods are on v2f16 elements
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ SmallVector<SDValue, 8> EltsV2F16;
+ for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
+ SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
+ // Based on first element decide which mod we match, neg or abs
+ if (ElV2f16.getOpcode() != ISD::FNEG)
+ break;
+ EltsV2F16.push_back(ElV2f16.getOperand(0));
+ }
+
+ // All pairs of elements have neg modifier
+ if (BV->getNumOperands() == EltsV2F16.size()) {
+ Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
+ Mods |= SISrcMods::NEG;
+ Mods |= SISrcMods::NEG_HI;
+ }
+ }
+
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ Src = In;
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned ModOpcode;
+
+ // mods are on f16 elements
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ SmallVector<SDValue, 8> EltsF16;
+ checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
+ // Based on first element decide which mod we match, neg or abs
+ if (EltsF16.empty())
+ ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
+ if (ElF16.getOpcode() != ModOpcode)
+ return false;
+ EltsF16.push_back(ElF16.getOperand(0));
+ return true;
+ });
+
+ // All elements have ModOpcode modifier
+ if (BV->getNumOperands() * 2 == EltsF16.size())
+ selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
+ 16);
+ }
+
+ // mods are on v2f16 elements
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ SmallVector<SDValue, 8> EltsV2F16;
+
+ for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
+ SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
+ // Based on first element decide which mod we match, neg or abs
+ if (EltsV2F16.empty())
+ ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
+ if (ElV2f16->getOpcode() != ModOpcode)
+ break;
+ EltsV2F16.push_back(ElV2f16->getOperand(0));
+ }
+
+ // All elements have ModOpcode modifier
+ if (BV->getNumOperands() == EltsV2F16.size())
+ selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
+ 32);
+ }
+
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ Src = In;
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned ModOpcode;
+ SmallVector<SDValue, 8> EltsF32;
+
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
+ for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
+ SDValue ElF32 = stripBitcast(BV->getOperand(i));
+ // Based on first element decide which mod we match, neg or abs
+ if (EltsF32.empty())
+ ModOpcode = (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
+ if (ElF32.getOpcode() != ModOpcode)
+ break;
+ EltsF32.push_back(ElF32.getOperand(0));
+ }
+
+ // All elements had ModOpcode modifier
+ if (BV->getNumOperands() == EltsF32.size())
+ selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
+ 32);
+ }
+
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
+ BitVector UndefElements;
+ if (SDValue Splat = BV->getSplatValue(&UndefElements))
+ if (isInlineImmediate(Splat.getNode())) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
+ unsigned Imm = C->getAPIntValue().getSExtValue();
+ Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
+ return true;
+ }
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
+ unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
+ return true;
+ }
+ llvm_unreachable("unhandled Constant node");
+ }
+ }
+
+ // 16 bit splat
+ SDValue SplatSrc32 = stripBitcast(In);
+ if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32)) {
+ if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
+ SDValue SplatSrc16 = stripBitcast(Splat32);
+ if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16)) {
+ if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
+
+ // f16
+ if (isInlineImmediate(Splat.getNode())) {
+ const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat);
+ int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16);
+ return true;
+ }
+
+ // bf16
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ APInt BF16Value = C->getAPIntValue();
+ APInt F32Value = BF16Value.zext(32).shl(16);
+ if (TII->isInlineConstant(F32Value)) {
+ int64_t Imm = F32Value.getSExtValue();
+ Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
+ return true;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
+ SDValue &IndexKey) const {
+ unsigned Key = 0;
+ Src = In;
+
+ if (In.getOpcode() == ISD::SRL) {
+ const llvm::SDValue &ShiftSrc = In.getOperand(0);
+ ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
+ if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
+ ShiftAmt->getZExtValue() % 8 == 0) {
+ Key = ShiftAmt->getZExtValue() / 8;
+ Src = ShiftSrc;
+ }
+ }
+
+ IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
+ SDValue &IndexKey) const {
+ unsigned Key = 0;
+ Src = In;
+
+ if (In.getOpcode() == ISD::SRL) {
+ const llvm::SDValue &ShiftSrc = In.getOperand(0);
+ ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
+ if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
+ ShiftAmt->getZExtValue() == 16) {
+ Key = 1;
+ Src = ShiftSrc;
+ }
+ }
+
+ IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 8645490f0b16f1..3b42d88df0c246 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -240,6 +240,16 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
+ bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+ bool SelectWMMAVISrc(SDValue In, SDValue &Src) const;
+
+ bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const;
+ bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const;
+
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index fdee74d58d2691..f255d098b631c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3956,6 +3956,219 @@ AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
}};
}
+static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
+ MachineInstr *InsertPt,
+ MachineRegisterInfo &MRI) {
+ const TargetRegisterClass *DstRegClass;
+ switch (Elts.size()) {
+ case 8:
+ DstRegClass = &AMDGPU::VReg_256RegClass;
+ break;
+ case 4:
+ DstRegClass = &AMDGPU::VReg_128RegClass;
+ break;
+ case 2:
+ DstRegClass = &AMDGPU::VReg_64RegClass;
+ break;
+ default:
+ llvm_unreachable("unhandled Reg sequence size");
+ }
+
+ MachineIRBuilder B(*InsertPt);
+ auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
+ .addDef(MRI.createVirtualRegister(DstRegClass));
+ for (unsigned i = 0; i < Elts.size(); ++i) {
+ MIB.addReg(Elts[i]);
+ MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
+ }
+ return MIB->getOperand(0).getReg();
+}
+
+static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
+ SmallVectorImpl<Register> &Elts, Register &Src,
+ MachineInstr *InsertPt,
+ MachineRegisterInfo &MRI) {
+ if (ModOpcode == TargetOpcode::G_FNEG) {
+ Mods |= SISrcMods::NEG;
+ // Check if all elements also have abs modifier
+ SmallVector<Register, 8> NegAbsElts;
+ for (auto El : Elts) {
+ Register FabsSrc;
+ if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
+ break;
+ NegAbsElts.push_back(FabsSrc);
+ }
+ if (Elts.size() != NegAbsElts.size()) {
+ // Neg
+ Src = buildRegSequence(Elts, InsertPt, MRI);
+ } else {
+ // Neg and Abs
+ Mods |= SISrcMods::NEG_HI;
+ Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
+ }
+ } else {
+ assert(ModOpcode == TargetOpcode::G_FABS);
+ // Abs
+ Mods |= SISrcMods::NEG_HI;
+ Src = buildRegSequence(Elts, InsertPt, MRI);
+ }
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
+ Register Src = Root.getReg();
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned ModOpcode;
+ SmallVector<Register, 8> EltsF32;
+
+ if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
+ for (unsigned i = 0; i < BV->getNumSources(); ++i) {
+ MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
+ // Based on first element decide which mod we match, neg or abs
+ if (EltsF32.empty())
+ ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
+ : AMDGPU::G_FABS;
+ if (ElF32->getOpcode() != ModOpcode)
+ break;
+ EltsF32.push_back(ElF32->getOperand(1).getReg());
+ }
+
+ // All elements had ModOpcode modifier
+ if (BV->getNumSources() == EltsF32.size()) {
+ selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
+ *MRI);
+ }
+ }
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
+ Register Src = Root.getReg();
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ SmallVector<Register, 8> EltsV2F16;
+
+ if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
+ for (unsigned i = 0; i < CV->getNumSources(); ++i) {
+ Register FNegSrc;
+ if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
+ break;
+ EltsV2F16.push_back(FNegSrc);
+ }
+
+ // All elements had ModOpcode modifier
+ if (CV->getNumSources() == EltsV2F16.size()) {
+ Mods |= SISrcMods::NEG;
+ Mods |= SISrcMods::NEG_HI;
+ Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
+ }
+ }
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
+ Register Src = Root.getReg();
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ unsigned ModOpcode;
+ SmallVector<Register, 8> EltsV2F16;
+
+ if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
+ for (unsigned i = 0; i < CV->getNumSources(); ++i) {
+ MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
+ // Based on first element decide which mod we match, neg or abs
+ if (EltsV2F16.empty())
+ ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG
+ : AMDGPU::G_FABS;
+ if (ElV2F16->getOpcode() != ModOpcode)
+ break;
+ EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
+ }
+
+ // All elements had ModOpcode modifier
+ if (CV->getNumSources() == EltsV2F16.size()) {
+ MachineIRBuilder B(*Root.getParent());
+ selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
+ *MRI);
+ }
+ }
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
+ std::optional<FPValueAndVReg> FPValReg;
+ if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
+ if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) {
+ return {{[=](MachineInstrBuilder &MIB) {
+ MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
+ }}};
+ }
+ // Non-inlineable splat floats should not fall-through for integer immediate
+ // checks.
+ return {};
+ }
+
+ APInt ICst;
+ if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
+ if (TII.isInlineConstant(ICst)) {
+ return {
+ {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
+ }
+ }
+
+ return {};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
+ Register Src =
+ getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
+ unsigned Key = 0;
+
+ Register ShiftSrc;
+ std::optional<ValueAndVReg> ShiftAmt;
+ if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
+ MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
+ ShiftAmt->Value.getZExtValue() % 8 == 0) {
+ Key = ShiftAmt->Value.getZExtValue() / 8;
+ Src = ShiftSrc;
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
+
+ Register Src =
+ getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
+ unsigned Key = 0;
+
+ Register ShiftSrc;
+ std::optional<ValueAndVReg> ShiftAmt;
+ if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
+ MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
+ ShiftAmt->Value.getZExtValue() == 16) {
+ Src = ShiftSrc;
+ Key = 1;
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
+ }};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 12ea46c2895b04..ef7630f137aca6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -199,6 +199,19 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectWMMAModsF32NegAbs(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectWMMAModsF16Neg(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectWMMAModsF16NegAbs(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectWMMAVISrc(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSWMMACIndex8(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSWMMACIndex16(MachineOperand &Root) const;
+
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8e74d4c0e94592..32921bb248caf0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7134,6 +7134,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
case Intrinsic::amdgcn_image_bvh_intersect_ray:
return legalizeBVHIntrinsic(MI, B);
+ case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
+ Register Index = MI.getOperand(5).getReg();
+ LLT S32 = LLT::scalar(32);
+ if (MRI.getType(Index) != S32)
+ MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ return true;
+ }
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
+ Register Index = MI.getOperand(7).getReg();
+ LLT S32 = LLT::scalar(32);
+ if (MRI.getType(Index) != S32)
+ MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
+ return true;
+ }
case Intrinsic::amdgcn_fmed3: {
GISelChangeObserver &Observer = Helper.Observer;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index bdd4e891f15899..09fac963d222d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4505,6 +4505,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
+ case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
+ case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 67263f23b98314..bb1c6b73372999 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -414,6 +414,22 @@ def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
+def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
+def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
+def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
+def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
+def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
// The dummy boolean output is divergent from the IR's perspective,
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 6b1d04b04066fe..9ab657f4e7bb4f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -151,6 +151,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
ImmTyOpSelHi,
ImmTyNegLo,
ImmTyNegHi,
+ ImmTyIndexKey8bit,
+ ImmTyIndexKey16bit,
ImmTyDPP8,
ImmTyDppCtrl,
ImmTyDppRowMask,
@@ -383,6 +385,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isCPol() const { return isImmTy(ImmTyCPol); }
+ bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); }
+ bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); }
@@ -656,6 +660,14 @@ class AMDGPUOperand : public MCParsedAsmOperand {
return isVISrcF16() || isVISrcB32();
}
+ bool isVISrc_64F16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16);
+ }
+
+ bool isVISrc_64B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
+ }
+
bool isVISrc_64B64() const {
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64);
}
@@ -672,6 +684,14 @@ class AMDGPUOperand : public MCParsedAsmOperand {
return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
}
+ bool isVISrc_256B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_256F32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32);
+ }
+
bool isVISrc_256B64() const {
return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64);
}
@@ -1047,6 +1067,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {
case ImmTyOffset1: OS << "Offset1"; break;
case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break;
case ImmTyCPol: OS << "CPol"; break;
+ case ImmTyIndexKey8bit: OS << "index_key"; break;
+ case ImmTyIndexKey16bit: OS << "index_key"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
@@ -1604,6 +1626,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
ParseStatus parseRegWithFPInputMods(OperandVector &Operands);
ParseStatus parseRegWithIntInputMods(OperandVector &Operands);
ParseStatus parseVReg32OrOff(OperandVector &Operands);
+ ParseStatus tryParseIndexKey(OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy);
+ ParseStatus parseIndexKey8bit(OperandVector &Operands);
+ ParseStatus parseIndexKey16bit(OperandVector &Operands);
+
ParseStatus parseDfmtNfmt(int64_t &Format);
ParseStatus parseUfmt(int64_t &Format);
ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc,
@@ -1784,6 +1811,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
+ void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands);
+
void cvtVOPD(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
@@ -4367,7 +4396,11 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) {
uint64_t TSFlags = MII.get(Opc).TSFlags;
// v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2)
- if (!(TSFlags & SIInstrFlags::IsDOT))
+ // v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1)
+ // v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1)
+ // other wmma/swmmac instructions don't have neg_lo/neg_hi operand.
+ if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) &&
+ !(TSFlags & SIInstrFlags::IsSWMMAC))
return true;
int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
@@ -6468,6 +6501,33 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref,
return true;
}
+ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy) {
+ const char *Pref = "index_key";
+ int64_t ImmVal = 0;
+ SMLoc Loc = getLoc();
+ auto Res = parseIntWithPrefix(Pref, ImmVal);
+ if (!Res.isSuccess())
+ return Res;
+
+ if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1))
+ return Error(Loc, Twine("out of range ", StringRef(Pref)));
+
+ if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3))
+ return Error(Loc, Twine("out of range ", StringRef(Pref)));
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, ImmTy));
+ return ParseStatus::Success;
+}
+
+ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) {
+ return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey8bit);
+}
+
+ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) {
+ return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit);
+}
+
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
@@ -8340,10 +8400,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
}
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
- if (NegLoIdx != -1) {
+ if (NegLoIdx != -1)
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
+
+ int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
+ if (NegHiIdx != -1)
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
- }
const int Ops[] = { AMDGPU::OpName::src0,
AMDGPU::OpName::src1,
@@ -8363,11 +8425,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if (OpSelHiIdx != -1)
OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
- if (NegLoIdx != -1) {
- int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
+ if (NegLoIdx != -1)
NegLo = Inst.getOperand(NegLoIdx).getImm();
+
+ if (NegHiIdx != -1)
NegHi = Inst.getOperand(NegHiIdx).getImm();
- }
for (int J = 0; J < 3; ++J) {
int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
@@ -8403,6 +8465,43 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
cvtVOP3P(Inst, Operands, OptIdx);
}
+static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands,
+ unsigned i, unsigned Opc, unsigned OpName) {
+ if (AMDGPU::getNamedOperandIdx(Opc, OpName) != -1)
+ ((AMDGPUOperand &)*Operands[i]).addRegOrImmWithFPInputModsOperands(Inst, 2);
+ else
+ ((AMDGPUOperand &)*Operands[i]).addRegOperands(Inst, 1);
+}
+
+void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) {
+ unsigned Opc = Inst.getOpcode();
+
+ ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1);
+ addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers);
+ addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers);
+ ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); // srcTiedDef
+ ((AMDGPUOperand &)*Operands[4]).addRegOperands(Inst, 1); // src2
+
+ OptionalImmIndexMap OptIdx;
+ for (unsigned i = 5; i < Operands.size(); ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ OptIdx[Op.getImmTy()] = i;
+ }
+
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_8bit))
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyIndexKey8bit);
+
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_16bit))
+ addOptionalImmOperand(Inst, Operands, OptIdx,
+ AMDGPUOperand::ImmTyIndexKey16bit);
+
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI);
+
+ cvtVOP3P(Inst, Operands, OptIdx);
+}
+
//===----------------------------------------------------------------------===//
// VOPD
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 2c6aa2ee348a1f..a9968cfe25b46b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
@@ -704,6 +708,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
break;
Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS);
+ if (Res)
+ break;
+
+ Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS);
} while (false);
if (Res && AMDGPU::isMAC(MI.getOpcode())) {
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 102ecb5e7680e7..1c013cc25b2963 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1716,14 +1716,14 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
}
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
- if (!SIInstrInfo::isWMMA(*MI))
+ if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
- if (!SIInstrInfo::isWMMA(I))
+ auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
+ if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
return false;
// Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
@@ -1741,6 +1741,18 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
return true;
}
+ // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
+ // but Index can't overlap with PrevDstReg.
+ if (AMDGPU::isGFX12Plus(ST)) {
+ if (SIInstrInfo::isSWMMAC(*MI)) {
+ const Register CurIndex =
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ if (TRI->regsOverlap(PrevDstReg, CurIndex))
+ return true;
+ }
+ return false;
+ }
+
return false;
};
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 9e64e3fd79576d..abfa4a3531e8e1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1275,6 +1275,23 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
(ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue;
}
+ // Print three values of neg/opsel for wmma instructions (prints 0 when there
+ // is no src_modifier operand instead of not printing anything).
+ if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC ||
+ MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) {
+ NumOps = 0;
+ int DefaultValue = Mod == SISrcMods::OP_SEL_1;
+ for (int OpName :
+ {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers}) {
+ int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ if (Idx != -1)
+ Ops[NumOps++] = MI->getOperand(Idx).getImm();
+ else
+ Ops[NumOps++] = DefaultValue;
+ }
+ }
+
const bool HasDstSel =
NumOps > 0 &&
Mod == SISrcMods::OP_SEL_0 &&
@@ -1346,6 +1363,26 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
}
+void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
+ if (Imm == 0)
+ return;
+
+ O << " index_key:" << Imm;
+}
+
+void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto Imm = MI->getOperand(OpNo).getImm() & 0x7;
+ if (Imm == 0)
+ return;
+
+ O << " index_key:" << Imm;
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index e3958f88277da8..e91ff86b219a0c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -139,6 +139,10 @@ class AMDGPUInstPrinter : public MCInstPrinter {
const MCSubtargetInfo &STI, raw_ostream &O);
void printNegHi(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printIndexKey8bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printIndexKey16bit(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 8ab66d4fd5b861..19596d53b45328 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -167,6 +167,9 @@ enum : uint64_t {
// ds_gws_* instructions.
GWS = UINT64_C(1) << 62,
+
+ // Is a SWMMAC instruction.
+ IsSWMMAC = UINT64_C(1) << 63,
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2862a7787e75a3..a812cdc61500cc 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -208,6 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
assert(Old.isReg() && Fold.isImm());
if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
+ (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
(ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cf947dccafac55..d35b76c8ad54eb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8242,6 +8242,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SIInstrInfo::MO_ABS32_LO);
return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
}
+ case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
+ if (Op.getOperand(4).getValueType() == MVT::i32)
+ return SDValue();
+
+ SDLoc SL(Op);
+ auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), IndexKeyi32);
+ }
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+ case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
+ if (Op.getOperand(6).getValueType() == MVT::i32)
+ return SDValue();
+
+ SDLoc SL(Op);
+ auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
+ {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
+ IndexKeyi32, Op.getOperand(7)});
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 1b66d163714fbc..ab536f8f49d537 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -161,6 +161,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// ds_gws_* instructions.
field bit GWS = 0;
+ // This bit indicates that this is one of SWMMAC instructions.
+ field bit IsSWMMAC = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -248,6 +251,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{62} = GWS;
+ let TSFlags{63} = IsSWMMAC;
+
let SchedRW = [Write32Bit];
let AsmVariantName = AMDGPUAsmVariants.Default;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fc85b089aa471b..1c9dacc09f8154 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -802,6 +802,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return isMFMA(MI) || isWMMA(MI);
}
+ static bool isSWMMAC(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
+ }
+
+ bool isSWMMAC(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsSWMMAC;
+ }
+
bool isDOT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 903a04718b6446..29b3b014893f27 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1088,6 +1088,9 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">;
def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">;
def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">;
+def IndexKey16bit : CustomOperand<i32, 1>;
+def IndexKey8bit : CustomOperand<i32, 1>;
+
def dpp8 : CustomOperand<i32, 0, "DPP8">;
def dpp_ctrl : CustomOperand<i32, 0, "DPPCtrl">;
@@ -1344,6 +1347,13 @@ def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
+def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">;
+def WMMAModsF16Neg : ComplexPattern<untyped, 2, "SelectWMMAModsF16Neg">;
+def WMMAModsF16NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF16NegAbs">;
+def WMMAVISrc : ComplexPattern<untyped, 1, "SelectWMMAVISrc">;
+def SWMMACIndex8 : ComplexPattern<untyped, 2, "SelectSWMMACIndex8">;
+def SWMMACIndex16 : ComplexPattern<untyped, 2, "SelectSWMMACIndex16">;
+
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
@@ -2279,6 +2289,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsDOT = 0;
field bit IsSingle = 0;
field bit IsWMMA = 0;
+ field bit IsSWMMAC = 0;
field bit IsFP8 = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index f42af89cf5e6d3..b3265b73fa7e11 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1341,9 +1341,14 @@ def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">;
// VISrc_* Operands with a VGPR or an inline constant
//===----------------------------------------------------------------------===//
+def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">;
+def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">;
def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">;
+def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">;
def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">;
def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">;
+def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">;
+def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">;
def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">;
def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">;
def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 0c7a08cd4bc91f..107b95a9ca8eb0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -936,16 +936,19 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
!cast<Instruction>(NAME # _threeaddr # Suffix)>;
}
- if !eq(Type, WMMAOpSel) then {
- def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
- } else if !eq(Type, WMMAUIClamp) then {
- def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
- } else {
- def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ let SubtargetPredicate = isGFX11Only in {
+ if !eq(Type, WMMAOpSel) then {
+ def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ } else if !eq(Type, WMMAUIClamp) then {
+ def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ } else {
+ def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
+ }
}
}
+
let WaveSizePredicate = isWave32 in {
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
@@ -969,6 +972,398 @@ let WaveSizePredicate = isWave64 in {
}
+class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
+ bit _IsIU, bit _IsFP8BF8>
+ : VOP3P_Profile<VOPProfile<ArgTy>> {
+ bit IsIU = _IsIU;
+ bit IsFP8BF8 = _IsFP8BF8;
+ bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8));
+
+ int IndexType = _IndexType;
+
+ let IsPacked = 1;
+ let IsWMMA = !not(_IsSWMMAC);
+ let IsSWMMAC = _IsSWMMAC;
+
+ bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP);
+ bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret);
+ bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32));
+ bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16));
+ bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16));
+
+ bit NegLo01 = !or(IsF16BF16, IsIU);
+ bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
+ bit NegHi01 = IsF16BF16;
+ bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA);
+ bit NegLoAny = !or(NegLo01, NegLo2);
+ bit NegHiAny = !or(NegHi01, NegHi2);
+
+ let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256,
+ !eq(ArgTy[0], v8i32): VDst_256,
+ !eq(ArgTy[0], v8f16): VDst_128,
+ !eq(ArgTy[0], v8i16): VDst_128,
+ !eq(ArgTy[0], v4f32): VDst_128,
+ !eq(ArgTy[0], v4i32): VDst_128,
+ !eq(ArgTy[0], v4f16): VDst_64,
+ !eq(ArgTy[0], v4i16): VDst_64);
+ let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128,
+ !eq(ArgTy[1], v4f16): VRegSrc_64,
+ !eq(ArgTy[1], v4i16): VRegSrc_64,
+ !eq(ArgTy[1], v8i16): VRegSrc_128,
+ !eq(ArgTy[1], v4i32): VRegSrc_128,
+ !eq(ArgTy[1], v2i32): VRegSrc_64,
+ !eq(ArgTy[1], i32) : VRegSrc_32);
+ let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256,
+ !eq(ArgTy[2], v16i16): VRegSrc_256,
+ !eq(ArgTy[2], v8f16): VRegSrc_128,
+ !eq(ArgTy[2], v8i16): VRegSrc_128,
+ !eq(ArgTy[2], v4i32): VRegSrc_128,
+ !eq(ArgTy[1], v4i16): VRegSrc_64,
+ !eq(ArgTy[1], v4f16): VRegSrc_64,
+ !eq(ArgTy[2], v2i32): VRegSrc_64,
+ !eq(ArgTy[2], i32) : VRegSrc_32);
+ let Src2RC64 = !if(IsSWMMAC, DstRC,
+ !cond(!eq(ArgTy[3], v8f32): VISrc_256_f32,
+ !eq(ArgTy[3], v8i32): VISrc_256_b32,
+ !eq(ArgTy[3], v8f16): VISrc_128_f16,
+ !eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16
+ !eq(ArgTy[3], v4f16): VISrc_64_f16,
+ !eq(ArgTy[3], v4i16): VISrc_64_b32,
+ !eq(ArgTy[3], v4i32): VISrc_128_b32,
+ !eq(ArgTy[3], v4f32): VISrc_128_f32));
+
+ // For f16 and bf16 matrices A and B, each element can be modified by
+ // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is
+ // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext)
+ // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each
+ // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1).
+
+ // Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index
+ // ---------------------------------------------------------------------------
+ // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32)
+ // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32)
+ // ---------------------------------------------------------------------------
+ // wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16)
+ // wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16)
+ // ---------------------------------------------------------------------------
+ // wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for
+ // | neg_lo = 1 i4/i8(sext) | i32 matrices
+ // ---------------------------------------------------------------------------
+ // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32)
+ // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32)
+ // ---------------------------------------------------------------------------
+ // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
+ // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
+ // ---------------------------------------------------------------------------
+ // swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix
+ // swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst
+ // ---------------------------------------------------------------------------
+ // swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix
+ // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst
+ // ---------------------------------------------------------------------------
+ // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix
+ // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst
+
+ // pseudo
+
+ // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16
+ // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers,
+ // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32
+ // f16 or bf16). swmmac use index_key and don't use src 2 modifiers.
+
+ dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers));
+ dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers));
+ dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers));
+ dag IndexKey = !cond(!eq(IndexType, 0) : (ins),
+ !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit),
+ !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit));
+ dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins));
+ dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi),
+ !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo),
+ !and(!not(NegLoAny), !not(NegHiAny)) : (ins));
+
+ let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1),
+ !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)),
+ IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)),
+ Clamp, Neg);
+
+ // asm
+
+ string IndexKeyAsm = !cond(!eq(IndexType, 0) : "",
+ !eq(IndexType, 8) : "$index_key_8bit",
+ !eq(IndexType, 16) : "$index_key_16bit");
+ string ClampAsm = !if(IsIU, "$clamp", "");
+ string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi",
+ !and(NegLoAny, !not(NegHiAny)) : "$neg_lo",
+ !and(!not(NegLoAny), !not(NegHiAny)) : "");
+
+ let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm;
+
+ // isel patterns
+
+ dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))),
+ IsAB_BF16 : (ins Src0VT:$src0),
+ IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0),
+ IsFP8BF8 : (ins Src0VT:$src0));
+ dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsAB_BF16 : (ins (i32 8), Src0VT:$src0),
+ IsIU : (ins i32:$src0_modifiers, Src0VT:$src0),
+ IsFP8BF8 : (ins Src0VT:$src0));
+ dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))),
+ IsAB_BF16 : (ins Src1VT:$src1),
+ IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1),
+ IsFP8BF8 : (ins Src1VT:$src1));
+ dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsAB_BF16 : (ins (i32 8), Src1VT:$src1),
+ IsIU : (ins i32:$src1_modifiers, Src1VT:$src1),
+ IsFP8BF8 : (ins Src1VT:$src1));
+ dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))),
+ IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))),
+ IsC_BF16 : (ins Src2VT:$src2),
+ IsIU : (ins Src2VT:$src2),
+ IsSWMMAC : (ins));
+ dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2),
+ IsC_BF16 : (ins (i32 8), Src2VT:$src2),
+ IsIU : (ins Src2VT:$src2),
+ IsSWMMAC : (ins));
+ dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins));
+ dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
+ !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))),
+ !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))));
+ dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2),
+ !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit),
+ !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit));
+ dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2)));
+ dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2));
+
+
+ dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat);
+ dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat);
+
+ dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat);
+ dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat);
+
+ // wmma pattern where src2 is inline imm uses _threeaddr pseudo,
+ // can't use _twoaddr since it would violate src2 tied to vdst constraint.
+ dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat);
+ dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat);
+}
+
+multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in
+ def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let PseudoInstr = Instr#PseudoInstrSuffix;
+ }
+
+ let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in
+ def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let PseudoInstr = Instr#PseudoInstrSuffix;
+ }
+
+ }
+ def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr),
+ !cast<Instruction>(NAME # _threeaddr)>;
+}
+
+multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> {
+ def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{
+ let Mnemonic = Instr;
+ let PseudoInstr = Instr#PseudoInstrSuffix;
+ let mayRaiseFPException = 0;
+ let ReadsModeReg = 0;
+ let AsmMatchConverter = "cvtSWMMAC";
+
+ let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
+ }
+}
+
+// First argument in Profile is types for matrices D, A, B and C (D = A * B + C)
+// as used by llvm ir, types are vectors(with matrix elements)
+// wave32:
+// For 16x16 matrices, lanes 0 to 31 will have 8 matrix elts,
+// for 16 x 32 16 elts and for 16 x 64 lanes have 32 elts.
+// wave64:
+// lanes will have half the size of elements in lanes compared to wave32 with
+// exception of 16x16_iu4: lanes0-31 will have 8xi4, remaining lanes are ignored
+
+// general idea on element distribution
diff erences:
+// wave32: lane n has 8 matrix elements
+// wave64: lane n has first 4, lane n+32 has other 4 elements
+
+// index size, for each 2 elements in lane you need 4bits in index
+
+// Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s.
+// Original type for them is in comment on the right and refers to A and B.
+
+def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>;
+def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>;
+def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>;
+def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>;
+def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8
+def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4
+def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8
+def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4
+
+def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>;
+def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>;
+def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>;
+def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>;
+def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8
+def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 *
+def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8
+def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4
+
+def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>;
+def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>;
+def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>;
+def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>;
+def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8
+def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4
+def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 **
+def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8
+
+def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>;
+def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>;
+def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>;
+def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>;
+def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8
+def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 ***
+def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4
+def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8
+
+// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored
+// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used
+// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored
+// for matrix A, index is i16; Matrix B uses all lanes
+
+let WaveSizePredicate = isWave32 in {
+defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">;
+defm V_WMMA_F16_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">;
+defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">;
+defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">;
+defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">;
+defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">;
+
+defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">;
+}
+
+let WaveSizePredicate = isWave64 in {
+defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">;
+defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">;
+defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">;
+defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">;
+defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">;
+defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">;
+
+defm V_SWMMAC_F32_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_BF16_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+}
+
+// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 Intrinsics have imm operand that
+// controls opsel. Used by gfx11, removed in gfx12 (operand must be 0).
+multiclass WMMAPat<string Inst, SDPatternOperator node, VOP3PWMMA_Profile P, bit IsGFX11OpselIntrinsic = 0> {
+ def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
+ (P.DstVT !setdagop(P.WmmaOutPat, !cast<Instruction>(Inst#"_twoaddr")))>;
+ let AddedComplexity = 4 in
+ def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
+ (P.DstVT !setdagop(P.WmmaInlineOutPat, !cast<Instruction>(Inst#"_threeaddr")))>;
+}
+
+class SWMMACPat<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
+ GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
+ (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>;
+
+class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
+ GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
+ (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{
+ let WaveSizePredicate = isWave64;
+ }
+
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
+ defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>;
+
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w32>;
+ def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, int_amdgcn_swmmac_i32_16x16x64_iu4)),
+ (I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
+}
+
+let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
+ defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>;
+
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x64_iu4, I32_IU4X64_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>;
+}
+
+
//===----------------------------------------------------------------------===//
// Begin Real Encodings
//===----------------------------------------------------------------------===//
@@ -1005,6 +1400,99 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<7> op, string backing_ps_name = NAME
VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
}
+class VOP3PeWmma<bits<7> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
+ : VOP3Pe_gfx11_gfx12<op, P>{
+ // opsel
+ let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
+ !eq(WMMAP.IndexType, 8) : index_key_8bit{0},
+ !eq(WMMAP.IndexType, 16) : index_key_16bit{0});
+ let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
+ let Inst{13} = 0;
+ // opsel_hi
+ let Inst{59} = 1;
+ let Inst{60} = 1;
+ let Inst{14} = 1;
+ // neg_lo
+ let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
+ let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
+ let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0);
+ // neg_hi
+ let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0);
+ let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
+ let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
+ // clamp
+ let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0);
+}
+
+multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<7> op, VOP3PWMMA_Profile WMMAP,
+ string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ def Gen.Suffix :
+ VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
+ VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
+}
+
+multiclass VOP3P_Real_WMMA_gfx12 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
+ }
+}
+
+multiclass VOP3P_Real_WMMA_gfx12w64 <bits<7> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base <GFX12Gen, op, WMMAP>;
+ }
+}
+
+defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>;
+defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>;
+defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>;
+defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>;
+defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>;
+defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>;
+
+defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>;
+defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>;
+defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>;
+defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>;
+defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>;
+defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>;
+
+
+defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>;
+defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>;
+defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>;
+defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>;
+
+defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>;
+defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>;
+defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>;
+defm V_SWMMAC_I32_16X16X64_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>;
+
multiclass VOP3P_Real_with_name<GFXGen Gen, bits<7> op,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8b6d1ddba595e6..20d7c88fb7e59f 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -124,6 +124,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let IsPacked = P.IsPacked;
let IsMAI = P.IsMAI;
let IsWMMA = P.IsWMMA;
+ let IsSWMMAC = P.IsSWMMAC;
let AsmOperands = !if(isVop3OpSel,
P.AsmVOP3OpSel,
@@ -383,6 +384,8 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
bits<4> src2_modifiers;
bits<9> src2;
bits<1> clamp;
+ bits<2> index_key_8bit;
+ bits<1> index_key_16bit;
let Inst{7-0} = vdst;
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index a08ca86c8a619b..4b6f6159ff23f3 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -63,52 +63,140 @@ define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C)
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
- %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C)
+ %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
- %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
+ %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
+; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
+ %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
store <16 x half> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
+; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
+ %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
+; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
+ %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
-; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
+; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
bb:
- %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
+ %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
ret void
}
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
+define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
+define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
+define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
+ store <8 x half> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
+define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
+ store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+ store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+ store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
+ store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
+ ret void
+}
+
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
@@ -190,12 +278,23 @@ declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half>, <16 x half> , <8 x float>) #1
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16>, <16 x i16> , <8 x float>) #1
-declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
-declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1
-declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1
-declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
new file mode 100644
index 00000000000000..ea377531df2aea
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -0,0 +1,504 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x half> %C
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <16 x half> %B
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <16 x half> %B
+ %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; both neg and abs patterns (wmma matrix C f32 or f16 )
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %fneg.fabs.C = fneg <8 x float> %fabs.C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
+ %fneg.fabs.C = fneg <8 x half> %fabs.C
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %el3 = extractelement <8 x float> %C, i32 3
+ %el3.fabs = call float @llvm.fabs.f32(float %el3)
+ %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
+ %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; A or B matrix modifier and constant in C
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; pack f16 elements with v_perm_b32 since they don't come from same b32
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9]
+; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
+; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18
+; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8
+; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14
+; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %C = load <16 x half>, ptr %Caddr
+ %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %fneg.C_shuffle = fneg <8 x half> %C_shuffle
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
+declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
+declare float @llvm.fabs.f32(float)
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
new file mode 100644
index 00000000000000..6251dfdc392ebc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <8 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x42004200
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
+ store <8 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
+ store <8 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
+ store <8 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
+; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
new file mode 100644
index 00000000000000..fe6d16bd8b5ead
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
@@ -0,0 +1,309 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
new file mode 100644
index 00000000000000..c80d7a6d9a836e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
@@ -0,0 +1,321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v20, v[20:21], off
+; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v20, v[20:21], off
+; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v16, v[16:17], off
+; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0)
+ store <8 x half> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1)
+ store <8 x half> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v16, v[16:17], off
+; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0)
+ store <8 x i16> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1)
+ store <8 x i16> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
+ store <8 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
+ store <8 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v11, v[11:12], off
+; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
+; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
+ store <8 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
+ store <8 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
new file mode 100644
index 00000000000000..c4edc5b72b2fbb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
@@ -0,0 +1,370 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
+ store <8 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
+ store <8 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
+ store <8 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
+ store <8 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
new file mode 100644
index 00000000000000..eafbfb6d1eeb53
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -0,0 +1,459 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <4 x half> %B
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <4 x half> %B
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x half> %C
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; both neg and abs patterns (wmma matrix C f32 or f16 )
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %fneg.fabs.C = fneg <4 x float> %fabs.C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
+ %fneg.fabs.C = fneg <4 x half> %fabs.C
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %el3 = extractelement <4 x float> %C, i32 3
+ %el3.fabs = call float @llvm.fabs.f32(float %el3)
+ %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
+ %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; A or B matrix modifier and constant in C
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <4 x half> %B
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; pack f16 elements with v_perm_b32 since they don't come from same b32
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v8
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v4, v9, 16, v4
+; GFX12-NEXT: v_lshl_or_b32 v5, v11, 16, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %C = load <8 x half>, ptr %Caddr
+ %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %fneg.C_shuffle = fneg <4 x half> %C_shuffle
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare float @llvm.fabs.f32(float)
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
new file mode 100644
index 00000000000000..c4d70fd5f0637f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -0,0 +1,430 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v9, s3
+; GFX12-NEXT: v_mov_b32_e32 v8, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v9, s3
+; GFX12-NEXT: v_mov_b32_e32 v8, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x42004200
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
new file mode 100644
index 00000000000000..7e1d09805df3f6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
new file mode 100644
index 00000000000000..b6f1828dce2576
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -0,0 +1,472 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v10, v[10:11], off
+; GFX12-NEXT: v_mov_b32_e32 v23, v9
+; GFX12-NEXT: v_mov_b32_e32 v22, v8
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v9
+; GFX12-NEXT: v_mov_b32_e32 v26, v8
+; GFX12-NEXT: v_mov_b32_e32 v25, v7
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v31, v9
+; GFX12-NEXT: v_mov_b32_e32 v30, v8
+; GFX12-NEXT: v_mov_b32_e32 v29, v7
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v10, v[10:11], off
+; GFX12-NEXT: v_mov_b32_e32 v23, v9
+; GFX12-NEXT: v_mov_b32_e32 v22, v8
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v9
+; GFX12-NEXT: v_mov_b32_e32 v26, v8
+; GFX12-NEXT: v_mov_b32_e32 v25, v7
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v31, v9
+; GFX12-NEXT: v_mov_b32_e32 v30, v8
+; GFX12-NEXT: v_mov_b32_e32 v29, v7
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v22, v[8:9], off
+; GFX12-NEXT: v_mov_b32_e32 v9, v7
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v7
+; GFX12-NEXT: v_mov_b32_e32 v18, v6
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
+ store <4 x half> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
+ store <4 x half> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
+ store <4 x half> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
+ store <4 x half> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v22, v[8:9], off
+; GFX12-NEXT: v_mov_b32_e32 v9, v7
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v7
+; GFX12-NEXT: v_mov_b32_e32 v18, v6
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
+ store <4 x i16> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
+ store <4 x i16> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
+ store <4 x i16> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
+ store <4 x i16> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
+ store <4 x i32> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
+ store <4 x i32> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v6, v[6:7], off
+; GFX12-NEXT: v_mov_b32_e32 v15, v5
+; GFX12-NEXT: v_mov_b32_e32 v14, v4
+; GFX12-NEXT: v_mov_b32_e32 v13, v3
+; GFX12-NEXT: v_mov_b32_e32 v12, v2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v16, v6
+; GFX12-NEXT: v_mov_b32_e32 v15, v5
+; GFX12-NEXT: v_mov_b32_e32 v14, v4
+; GFX12-NEXT: v_mov_b32_e32 v13, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
+declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
new file mode 100644
index 00000000000000..0d1871a18d4055
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
+declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
new file mode 100644
index 00000000000000..fb2af36839b5d2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -0,0 +1,499 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x half> %C
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <16 x half> %B
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <16 x half> %B
+ %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; both neg and abs patterns (wmma matrix C f32 or f16 )
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %fneg.fabs.C = fneg <8 x float> %fabs.C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
+ %fneg.fabs.C = fneg <8 x half> %fabs.C
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %el3 = extractelement <8 x float> %C, i32 3
+ %el3.fabs = call float @llvm.fabs.f32(float %el3)
+ %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
+ %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; A or B matrix modifier and constant in C
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; pack f16 elements with v_perm_b32 since they don't come from same b32
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16
+; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
+; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100
+; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_perm_b32 v13, v19, v18, 0x5040100
+; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %C = load <16 x half>, ptr %Caddr
+ %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %fneg.C_shuffle = fneg <8 x half> %C_shuffle
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
+declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
+declare float @llvm.fabs.f32(float)
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
new file mode 100644
index 00000000000000..51f93b57f38e90
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
@@ -0,0 +1,431 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
+; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
+; GFX12-NEXT: v_mov_b32_e32 v17, v10
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
+; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
+; GFX12-NEXT: v_mov_b32_e32 v17, v10
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <8 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GFX12-NEXT: v_mov_b32_e32 v13, v10
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
+ store <8 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
+ store <8 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
+; GFX12-NEXT: v_mov_b32_e32 v13, v10
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
+ store <8 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GFX12-NEXT: v_mov_b32_e32 v13, v6
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4
+; GFX12-NEXT: v_mov_b32_e32 v11, v4
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GFX12-NEXT: v_mov_b32_e32 v13, v6
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GFX12-NEXT: v_mov_b32_e32 v13, v6
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GFX12-NEXT: v_mov_b32_e32 v13, v6
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GFX12-NEXT: v_mov_b32_e32 v13, v6
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GFX12-NEXT: v_mov_b32_e32 v13, v6
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
new file mode 100644
index 00000000000000..48297932aa5984
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
@@ -0,0 +1,309 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
new file mode 100644
index 00000000000000..43538768f00fd9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
@@ -0,0 +1,321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v20, v[20:21], off
+; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v20, v[20:21], off
+; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v16, v[16:17], off
+; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0)
+ store <8 x half> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1)
+ store <8 x half> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v16, v[16:17], off
+; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0)
+ store <8 x i16> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1)
+ store <8 x i16> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
+ store <8 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
+ store <8 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v11, v[11:12], off
+; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
+; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
+; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
+ store <8 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
+ store <8 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+ store <8 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+ store <8 x float> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
new file mode 100644
index 00000000000000..2db3b07de54d0a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
@@ -0,0 +1,370 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
+ store <8 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
+ store <8 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
+ store <8 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
+ store <8 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+ store <8 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
new file mode 100644
index 00000000000000..1f2ffaf7627125
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -0,0 +1,456 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <4 x half> %B
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <4 x half> %B
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x half> %C
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <4 x float> %C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; both neg and abs patterns (wmma matrix C f32 or f16 )
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
+ %fneg.fabs.C = fneg <4 x float> %fabs.C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
+ %fneg.fabs.C = fneg <4 x half> %fabs.C
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %el3 = extractelement <4 x float> %C, i32 3
+ %el3.fabs = call float @llvm.fabs.f32(float %el3)
+ %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
+ %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; A or B matrix modifier and constant in C
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <4 x half> %A
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <4 x half> %B
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; pack f16 elements with v_perm_b32 since they don't come from same b32
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
+; GFX12-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %C = load <8 x half>, ptr %Caddr
+ %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %fneg.C_shuffle = fneg <4 x half> %C_shuffle
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0)
+ store <4 x half> %res, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare float @llvm.fabs.f32(float)
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
new file mode 100644
index 00000000000000..fa0a7c98cea323
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -0,0 +1,373 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, v6
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v9, v6
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, v6
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v9, v6
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, v6
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, v6
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
new file mode 100644
index 00000000000000..803453e45ced05
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
new file mode 100644
index 00000000000000..0c5b19a8416fb8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -0,0 +1,472 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v10, v[10:11], off
+; GFX12-NEXT: v_mov_b32_e32 v23, v9
+; GFX12-NEXT: v_mov_b32_e32 v22, v8
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v9
+; GFX12-NEXT: v_mov_b32_e32 v26, v8
+; GFX12-NEXT: v_mov_b32_e32 v25, v7
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v31, v9
+; GFX12-NEXT: v_mov_b32_e32 v30, v8
+; GFX12-NEXT: v_mov_b32_e32 v29, v7
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v10, v[10:11], off
+; GFX12-NEXT: v_mov_b32_e32 v23, v9
+; GFX12-NEXT: v_mov_b32_e32 v22, v8
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v9
+; GFX12-NEXT: v_mov_b32_e32 v26, v8
+; GFX12-NEXT: v_mov_b32_e32 v25, v7
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v31, v9
+; GFX12-NEXT: v_mov_b32_e32 v30, v8
+; GFX12-NEXT: v_mov_b32_e32 v29, v7
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v22, v[8:9], off
+; GFX12-NEXT: v_mov_b32_e32 v9, v7
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v7
+; GFX12-NEXT: v_mov_b32_e32 v18, v6
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
+ store <4 x half> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
+ store <4 x half> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
+ store <4 x half> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
+ store <4 x half> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v22, v[8:9], off
+; GFX12-NEXT: v_mov_b32_e32 v9, v7
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v7
+; GFX12-NEXT: v_mov_b32_e32 v18, v6
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
+ store <4 x i16> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
+ store <4 x i16> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
+ store <4 x i16> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
+ store <4 x i16> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
+ store <4 x i32> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
+ store <4 x i32> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v6, v[6:7], off
+; GFX12-NEXT: v_mov_b32_e32 v15, v5
+; GFX12-NEXT: v_mov_b32_e32 v14, v4
+; GFX12-NEXT: v_mov_b32_e32 v13, v3
+; GFX12-NEXT: v_mov_b32_e32 v12, v2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v16, v6
+; GFX12-NEXT: v_mov_b32_e32 v15, v5
+; GFX12-NEXT: v_mov_b32_e32 v14, v4
+; GFX12-NEXT: v_mov_b32_e32 v13, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
+declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
new file mode 100644
index 00000000000000..b25f2785133310
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
+declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
new file mode 100644
index 00000000000000..47a1e06c5d7dc5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
@@ -0,0 +1,351 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+
+# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
+# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
+# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1
+
+---
+name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+...
+---
+name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
+
+ ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
+
+ ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
+
+ ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+
+ ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+
+ ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+
+ ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+ early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
new file mode 100644
index 00000000000000..34c37aa91ab80f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
@@ -0,0 +1,352 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+
+# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
+# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
+# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1
+
+---
+name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+
+ ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+
+ ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+
+ ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
+
+ ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+
+ ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+
+ ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GFX12-NEXT: V_NOP_e32 implicit $exec
+ ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
+ early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec
+ early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec
+...
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s
new file mode 100644
index 00000000000000..e1cd0cab663472
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s
@@ -0,0 +1,1529 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0, v[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1
+// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1
+// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1
+// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b]
+
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1
+// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s
new file mode 100644
index 00000000000000..8bd9e5039b7207
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s
@@ -0,0 +1,1529 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], s[0:1], v[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], 1.0, v[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
+// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1
+// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0
+// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0
+// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1
+// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1
+// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1
+// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
+// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0
+// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b]
+
+v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1
+// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2
+// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c]
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c]
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt
new file mode 100644
index 00000000000000..5079d2f0896561
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt
@@ -0,0 +1,1628 @@
+# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c]
+
+[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18]
+
+[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b]
+
+[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a]
+
+
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a]
+
+[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18]
+
+[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a]
+
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0]
+# GFX12:v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18]
+
+[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18]
+
+[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18]
+
+[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18]
+
+[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18]
+
+
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18]
+
+[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18]
+
+
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt
new file mode 100644
index 00000000000000..61700faa8e6075
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt
@@ -0,0 +1,1628 @@
+# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c]
+
+[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c]
+
+[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18]
+
+[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18]
+
+
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0]
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18]
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 516a984399ff81..638e46a2f9c752 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -253,22 +253,23 @@ def ROCDL_mfma_f32_32x32x16_fp8_fp8 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.fp8.f
//===---------------------------------------------------------------------===//
// WMMA intrinsics
-class ROCDL_Wmma_IntrOp<string mnemonic, list<Trait> traits = []> :
+class ROCDL_Wmma_IntrOp<string mnemonic, list<int> overloadedOperands,
+ list<Trait> traits = []> :
LLVM_IntrOpBase<ROCDL_Dialect, mnemonic,
"amdgcn_" # !subst(".","_", mnemonic),
- [0], [], traits, 1>,
+ [0], overloadedOperands, traits, 1>,
Arguments<(ins Variadic<LLVM_Type>:$args)> {
let assemblyFormat =
"$args attr-dict `:` functional-type($args, $res)";
}
// Available on RDNA3
-def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">;
-def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">;
-def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16">;
-def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">;
-def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">;
-def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">;
+def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16", [0]>;
+def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16", [0]>;
+def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16", [0]>;
+def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16", [0]>;
+def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8", [1]>;
+def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4", [1]>;
//===---------------------------------------------------------------------===//
// Operations on raw buffer resources (stride of 0, bounds checks either off or in
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3c9c70711ae230..26123300d74888 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -248,53 +248,53 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
// ---- Wave32 -----
// f16 -> f32
- // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
%r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32>
// bf16 -> f32
- // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
%r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32>
// f16 -> f16 (OPSEL = {0,1})
- // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
+ // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
%r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
// bf16 -> bf16 (OPSEL = {0,1})
- // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
+ // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
%r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16>
// int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
- // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
+ // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
%r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32>
// int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
- // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
+ // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
%r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
// ---- Wave64 -----
// f16 -> f32
- // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
%r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32>
// bf16 -> f32
- // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
%r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32>
// f16 -> f16 (OPSEL = {0,1})
- // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
+ // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
%r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16>
// bf16 -> bf16 (OPSEL = {0,1})
- // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
+ // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
%r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16>
// int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
- // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
+ // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
%r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32>
// int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
- // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
+ // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
%r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32>
llvm.return %r0 : vector<8xf32>
More information about the cfe-commits
mailing list