[flang-commits] [flang] c09d3c1 - [flang] Add PowerPC MMA intrinsics - part 2
Daniel Chen via flang-commits
flang-commits at lists.llvm.org
Tue Aug 15 11:33:07 PDT 2023
Author: Daniel Chen
Date: 2023-08-15T13:56:28-04:00
New Revision: c09d3c1ead632e39c19a78049f6e663b6dbebc4e
URL: https://github.com/llvm/llvm-project/commit/c09d3c1ead632e39c19a78049f6e663b6dbebc4e
DIFF: https://github.com/llvm/llvm-project/commit/c09d3c1ead632e39c19a78049f6e663b6dbebc4e.diff
LOG: [flang] Add PowerPC MMA intrinsics - part 2
Added:
flang/test/Lower/PowerPC/ppc-mma-accumulator-move-clear.f90
flang/test/Lower/PowerPC/ppc-mma-outer-product-1.f90
flang/test/Lower/PowerPC/ppc-mma-outer-product-2.f90
Modified:
flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
flang/module/mma.f90
flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
Removed:
################################################################################
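
This patch is part 2 of the PowerPC MMA work: it wires up the accumulator
move/clear subroutines (mma_xxmfacc, mma_xxmtacc, mma_xxsetaccz) and the full
set of xv* and masked pmxv* outer-product subroutines, covering the MMAOp enum
and handler table in PPCIntrinsicCall, the generic interfaces in the mma
module, and lowering tests. For orientation, a typical use of the newly
supported operations might look like the sketch below (illustrative only, not
taken from the patch; the input vectors are assumed to be defined by the
caller):

  subroutine f32_outer_product(a, b, result)
    use, intrinsic :: mma
    implicit none
    vector(real(4)), intent(in) :: a, b
    vector(real(4)), intent(out) :: result(4)
    __vector_quad :: acc
    call mma_xxsetaccz(acc)               ! clear the accumulator
    call mma_xvf32gerpp(acc, a, b)        ! acc = acc + outer product of a and b
    call mma_disassemble_acc(result, acc) ! copy the accumulator out (from part 1)
  end subroutine f32_outer_product
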
diff --git a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
index e5344922c2c476..68ae78abf9dc97 100644
--- a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
@@ -72,6 +72,70 @@ enum class MMAOp {
AssemblePair,
DisassembleAcc,
DisassemblePair,
+ Xxmfacc,
+ Xxmtacc,
+ Xxsetaccz,
+ Pmxvbf16ger2,
+ Pmxvbf16ger2nn,
+ Pmxvbf16ger2np,
+ Pmxvbf16ger2pn,
+ Pmxvbf16ger2pp,
+ Pmxvf16ger2,
+ Pmxvf16ger2nn,
+ Pmxvf16ger2np,
+ Pmxvf16ger2pn,
+ Pmxvf16ger2pp,
+
+ Pmxvf32ger,
+ Pmxvf32gernn,
+ Pmxvf32gernp,
+ Pmxvf32gerpn,
+ Pmxvf32gerpp,
+ Pmxvf64ger,
+ Pmxvf64gernn,
+ Pmxvf64gernp,
+ Pmxvf64gerpn,
+ Pmxvf64gerpp,
+
+ Pmxvi16ger2,
+ Pmxvi16ger2pp,
+ Pmxvi16ger2s,
+ Pmxvi16ger2spp,
+ Pmxvi4ger8,
+ Pmxvi4ger8pp,
+ Pmxvi8ger4,
+ Pmxvi8ger4pp,
+ Pmxvi8ger4spp,
+
+ Xvbf16ger2,
+ Xvbf16ger2nn,
+ Xvbf16ger2np,
+ Xvbf16ger2pn,
+ Xvbf16ger2pp,
+ Xvf16ger2,
+ Xvf16ger2nn,
+ Xvf16ger2np,
+ Xvf16ger2pn,
+ Xvf16ger2pp,
+ Xvf32ger,
+ Xvf32gernn,
+ Xvf32gernp,
+ Xvf32gerpn,
+ Xvf32gerpp,
+ Xvf64ger,
+ Xvf64gernn,
+ Xvf64gernp,
+ Xvf64gerpn,
+ Xvf64gerpp,
+ Xvi16ger2,
+ Xvi16ger2pp,
+ Xvi16ger2s,
+ Xvi16ger2spp,
+ Xvi4ger8,
+ Xvi4ger8pp,
+ Xvi8ger4,
+ Xvi8ger4pp,
+ Xvi8ger4spp,
};
enum class MMAHandlerOp {
diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
index f47780dd8cd8c1..9ce09262edf555 100644
--- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
@@ -59,6 +59,451 @@ static constexpr IntrinsicHandler ppcHandlers[]{
&PI::genMmaIntr<MMAOp::DisassemblePair, MMAHandlerOp::SubToFunc>),
{{{"data", asAddr}, {"pair", asValue}}},
/*isElemental=*/true},
+ {"__ppc_mma_pmxvbf16ger2_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvbf16ger2, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvbf16ger2nn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvbf16ger2nn,
+ MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvbf16ger2np",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvbf16ger2np,
+ MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvbf16ger2pn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvbf16ger2pn,
+ MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvbf16ger2pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvbf16ger2pp,
+ MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf16ger2_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf16ger2, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf16ger2nn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf16ger2nn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf16ger2np",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf16ger2np, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf16ger2pn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf16ger2pn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf16ger2pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf32ger",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf32ger, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf32gernn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf32gernn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf32gernp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf32gernp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf32gerpn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf32gerpn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf32gerpp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf32gerpp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf64ger",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf64ger, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf64gernn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf64gernn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf64gernp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf64gernp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf64gerpn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf64gerpn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvf64gerpp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvf64gerpp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi16ger2_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi16ger2, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi16ger2pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi16ger2s",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi16ger2s, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi16ger2spp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi16ger2spp,
+ MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi4ger8_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi4ger8, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi4ger8pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi4ger8pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi8ger4_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi8ger4, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi8ger4pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi8ger4pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_pmxvi8ger4spp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Pmxvi8ger4spp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr},
+ {"a", asValue},
+ {"b", asValue},
+ {"xmask", asValue},
+ {"ymask", asValue},
+ {"pmask", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvbf16ger2_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvbf16ger2, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvbf16ger2nn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvbf16ger2nn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvbf16ger2np",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvbf16ger2np, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvbf16ger2pn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvbf16ger2pn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvbf16ger2pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvbf16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf16ger2_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf16ger2, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf16ger2nn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf16ger2nn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf16ger2np",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf16ger2np, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf16ger2pn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf16ger2pn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf16ger2pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf32ger",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf32ger, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf32gernn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf32gernn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf32gernp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf32gernp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf32gerpn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf32gerpn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf32gerpp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf32gerpp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf64ger",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf64ger, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf64gernn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf64gernn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf64gernp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf64gernp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf64gerpn",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf64gerpn, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvf64gerpp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvf64gerpp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi16ger2_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi16ger2, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi16ger2pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi16ger2s",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi16ger2s, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi16ger2spp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi16ger2spp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi4ger8_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi4ger8, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi4ger8pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi4ger8pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi8ger4_",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi8ger4, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi8ger4pp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi8ger4pp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xvi8ger4spp",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xvi8ger4spp, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xxmfacc",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xxmfacc, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xxmtacc",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xxmtacc, MMAHandlerOp::FirstArgIsResult>),
+ {{{"acc", asAddr}}},
+ /*isElemental=*/true},
+ {"__ppc_mma_xxsetaccz",
+ static_cast<IntrinsicLibrary::SubroutineGenerator>(
+ &PI::genMmaIntr<MMAOp::Xxsetaccz, MMAHandlerOp::SubToFunc>),
+ {{{"acc", asAddr}}},
+ /*isElemental=*/true},
{"__ppc_mtfsf",
static_cast<IntrinsicLibrary::SubroutineGenerator>(&PI::genMtfsf<false>),
{{{"mask", asValue}, {"r", asValue}}},
@@ -1836,6 +2281,128 @@ const char *getMmaIrIntrName(MMAOp mmaOp) {
return "llvm.ppc.mma.disassemble.acc";
case MMAOp::DisassemblePair:
return "llvm.ppc.vsx.disassemble.pair";
+ case MMAOp::Xxmfacc:
+ return "llvm.ppc.mma.xxmfacc";
+ case MMAOp::Xxmtacc:
+ return "llvm.ppc.mma.xxmtacc";
+ case MMAOp::Xxsetaccz:
+ return "llvm.ppc.mma.xxsetaccz";
+ case MMAOp::Pmxvbf16ger2:
+ return "llvm.ppc.mma.pmxvbf16ger2";
+ case MMAOp::Pmxvbf16ger2nn:
+ return "llvm.ppc.mma.pmxvbf16ger2nn";
+ case MMAOp::Pmxvbf16ger2np:
+ return "llvm.ppc.mma.pmxvbf16ger2np";
+ case MMAOp::Pmxvbf16ger2pn:
+ return "llvm.ppc.mma.pmxvbf16ger2pn";
+ case MMAOp::Pmxvbf16ger2pp:
+ return "llvm.ppc.mma.pmxvbf16ger2pp";
+ case MMAOp::Pmxvf16ger2:
+ return "llvm.ppc.mma.pmxvf16ger2";
+ case MMAOp::Pmxvf16ger2nn:
+ return "llvm.ppc.mma.pmxvf16ger2nn";
+ case MMAOp::Pmxvf16ger2np:
+ return "llvm.ppc.mma.pmxvf16ger2np";
+ case MMAOp::Pmxvf16ger2pn:
+ return "llvm.ppc.mma.pmxvf16ger2pn";
+ case MMAOp::Pmxvf16ger2pp:
+ return "llvm.ppc.mma.pmxvf16ger2pp";
+ case MMAOp::Pmxvf32ger:
+ return "llvm.ppc.mma.pmxvf32ger";
+ case MMAOp::Pmxvf32gernn:
+ return "llvm.ppc.mma.pmxvf32gernn";
+ case MMAOp::Pmxvf32gernp:
+ return "llvm.ppc.mma.pmxvf32gernp";
+ case MMAOp::Pmxvf32gerpn:
+ return "llvm.ppc.mma.pmxvf32gerpn";
+ case MMAOp::Pmxvf32gerpp:
+ return "llvm.ppc.mma.pmxvf32gerpp";
+ case MMAOp::Pmxvf64ger:
+ return "llvm.ppc.mma.pmxvf64ger";
+ case MMAOp::Pmxvf64gernn:
+ return "llvm.ppc.mma.pmxvf64gernn";
+ case MMAOp::Pmxvf64gernp:
+ return "llvm.ppc.mma.pmxvf64gernp";
+ case MMAOp::Pmxvf64gerpn:
+ return "llvm.ppc.mma.pmxvf64gerpn";
+ case MMAOp::Pmxvf64gerpp:
+ return "llvm.ppc.mma.pmxvf64gerpp";
+ case MMAOp::Pmxvi16ger2:
+ return "llvm.ppc.mma.pmxvi16ger2";
+ case MMAOp::Pmxvi16ger2pp:
+ return "llvm.ppc.mma.pmxvi16ger2pp";
+ case MMAOp::Pmxvi16ger2s:
+ return "llvm.ppc.mma.pmxvi16ger2s";
+ case MMAOp::Pmxvi16ger2spp:
+ return "llvm.ppc.mma.pmxvi16ger2spp";
+ case MMAOp::Pmxvi4ger8:
+ return "llvm.ppc.mma.pmxvi4ger8";
+ case MMAOp::Pmxvi4ger8pp:
+ return "llvm.ppc.mma.pmxvi4ger8pp";
+ case MMAOp::Pmxvi8ger4:
+ return "llvm.ppc.mma.pmxvi8ger4";
+ case MMAOp::Pmxvi8ger4pp:
+ return "llvm.ppc.mma.pmxvi8ger4pp";
+ case MMAOp::Pmxvi8ger4spp:
+ return "llvm.ppc.mma.pmxvi8ger4spp";
+ case MMAOp::Xvbf16ger2:
+ return "llvm.ppc.mma.xvbf16ger2";
+ case MMAOp::Xvbf16ger2nn:
+ return "llvm.ppc.mma.xvbf16ger2nn";
+ case MMAOp::Xvbf16ger2np:
+ return "llvm.ppc.mma.xvbf16ger2np";
+ case MMAOp::Xvbf16ger2pn:
+ return "llvm.ppc.mma.xvbf16ger2pn";
+ case MMAOp::Xvbf16ger2pp:
+ return "llvm.ppc.mma.xvbf16ger2pp";
+ case MMAOp::Xvf16ger2:
+ return "llvm.ppc.mma.xvf16ger2";
+ case MMAOp::Xvf16ger2nn:
+ return "llvm.ppc.mma.xvf16ger2nn";
+ case MMAOp::Xvf16ger2np:
+ return "llvm.ppc.mma.xvf16ger2np";
+ case MMAOp::Xvf16ger2pn:
+ return "llvm.ppc.mma.xvf16ger2pn";
+ case MMAOp::Xvf16ger2pp:
+ return "llvm.ppc.mma.xvf16ger2pp";
+ case MMAOp::Xvf32ger:
+ return "llvm.ppc.mma.xvf32ger";
+ case MMAOp::Xvf32gernn:
+ return "llvm.ppc.mma.xvf32gernn";
+ case MMAOp::Xvf32gernp:
+ return "llvm.ppc.mma.xvf32gernp";
+ case MMAOp::Xvf32gerpn:
+ return "llvm.ppc.mma.xvf32gerpn";
+ case MMAOp::Xvf32gerpp:
+ return "llvm.ppc.mma.xvf32gerpp";
+ case MMAOp::Xvf64ger:
+ return "llvm.ppc.mma.xvf64ger";
+ case MMAOp::Xvf64gernn:
+ return "llvm.ppc.mma.xvf64gernn";
+ case MMAOp::Xvf64gernp:
+ return "llvm.ppc.mma.xvf64gernp";
+ case MMAOp::Xvf64gerpn:
+ return "llvm.ppc.mma.xvf64gerpn";
+ case MMAOp::Xvf64gerpp:
+ return "llvm.ppc.mma.xvf64gerpp";
+ case MMAOp::Xvi16ger2:
+ return "llvm.ppc.mma.xvi16ger2";
+ case MMAOp::Xvi16ger2pp:
+ return "llvm.ppc.mma.xvi16ger2pp";
+ case MMAOp::Xvi16ger2s:
+ return "llvm.ppc.mma.xvi16ger2s";
+ case MMAOp::Xvi16ger2spp:
+ return "llvm.ppc.mma.xvi16ger2spp";
+ case MMAOp::Xvi4ger8:
+ return "llvm.ppc.mma.xvi4ger8";
+ case MMAOp::Xvi4ger8pp:
+ return "llvm.ppc.mma.xvi4ger8pp";
+ case MMAOp::Xvi8ger4:
+ return "llvm.ppc.mma.xvi8ger4";
+ case MMAOp::Xvi8ger4pp:
+ return "llvm.ppc.mma.xvi8ger4pp";
+ case MMAOp::Xvi8ger4spp:
+ return "llvm.ppc.mma.xvi8ger4spp";
}
llvm_unreachable("getMmaIrIntrName");
}
@@ -1850,6 +2417,157 @@ mlir::FunctionType getMmaIrFuncType(mlir::MLIRContext *context, MMAOp mmaOp) {
return genMmaDisassembleFuncType(context, mmaOp);
case MMAOp::DisassemblePair:
return genMmaDisassembleFuncType(context, mmaOp);
+ case MMAOp::Xxmfacc:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 0);
+ case MMAOp::Xxmtacc:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 0);
+ case MMAOp::Xxsetaccz:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 0);
+ case MMAOp::Pmxvbf16ger2:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvbf16ger2nn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvbf16ger2np:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvbf16ger2pn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvbf16ger2pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvf16ger2:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvf16ger2nn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvf16ger2np:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvf16ger2pn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvf16ger2pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvf32ger:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf32gernn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf32gernp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf32gerpn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf32gerpp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf64ger:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 1, /*Vector*/ 1,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf64gernn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf64gernp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf64gerpn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvf64gerpp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1,
+ /*Integer*/ 2);
+ case MMAOp::Pmxvi16ger2:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvi16ger2pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvi16ger2s:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvi16ger2spp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvi4ger8:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvi4ger8pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvi8ger4:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvi8ger4pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Pmxvi8ger4spp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+ /*Integer*/ 3);
+ case MMAOp::Xvbf16ger2:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvbf16ger2nn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvbf16ger2np:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvbf16ger2pn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvbf16ger2pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf16ger2:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf16ger2nn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf16ger2np:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf16ger2pn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf16ger2pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf32ger:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf32gernn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf32gernp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf32gerpn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf32gerpp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvf64ger:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 1, /*Vector*/ 1);
+ case MMAOp::Xvf64gernn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1);
+ case MMAOp::Xvf64gernp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1);
+ case MMAOp::Xvf64gerpn:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1);
+ case MMAOp::Xvf64gerpp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1);
+ case MMAOp::Xvi16ger2:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvi16ger2pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvi16ger2s:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvi16ger2spp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvi4ger8:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvi4ger8pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvi8ger4:
+ return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvi8ger4pp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+ case MMAOp::Xvi8ger4spp:
+ return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
}
llvm_unreachable("getMmaIrFuncType");
}
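
genMmaVqFuncType itself was added in part 1; judging from the counts passed
above and from the CHECK lines in the new tests, the Quad/Pair/Vector/Integer
arguments appear to describe the operand list of the generated intrinsic call:
each quad is an accumulator (<512 x i1>), each pair is a __vector_pair
(<256 x i1>), each vector operand is a 16-byte VSX vector (<16 x i8>), and
each integer is an i32 mask. Under that reading, the Pmxvf64gerpp case above
maps to an intrinsic of roughly this shape (a sketch, not text from the patch):

  declare <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32)
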
diff --git a/flang/module/mma.f90 b/flang/module/mma.f90
index f34e6c8fd1a725..d6d2eb87879bc0 100644
--- a/flang/module/mma.f90
+++ b/flang/module/mma.f90
@@ -12,6 +12,12 @@ module mma
abstract interface
+!! ========== 1 argument subroutine interface ================================!!
+!! subroutine s(__vector_quad)
+ elemental subroutine sub_vq(acc)
+ __vector_quad, intent(inout) :: acc
+ end subroutine
+
!! ========== 3 arguments subroutine interface ===============================!!
!! subroutine s(__vector_pair, vector(i), vector(i))
#define ELEM_SUB_VPVIVI(VKIND) \
@@ -44,6 +50,62 @@ elemental subroutine sub_vpvr##VKIND##vr##VKIND(pair, arg1, arg2); \
#undef ELEM_SUB_VPVUVU
#undef ELEM_SUB_VPVRVR
+!! subroutine s(__vector_quad, vector(i), vector(i))
+#define ELEM_SUB_VQVIVI(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vi##VKIND##vi##VKIND(acc, a, b); \
+ __vector_quad, intent(INTENT) :: acc; \
+ vector(integer(VKIND)), intent(in) :: a, b; \
+ end subroutine ;
+
+!! subroutine s(__vector_quad, vector(u), vector(u))
+#define ELEM_SUB_VQVUVU(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vu##VKIND##vu##VKIND(acc, a, b); \
+ __vector_quad, intent(INTENT) :: acc; \
+ vector(unsigned(VKIND)), intent(in) :: a, b; \
+ end subroutine ;
+
+!! subroutine s(__vector_quad, vector(r), vector(r))
+#define ELEM_SUB_VQVRVR(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vr##VKIND##vr##VKIND(acc, a, b); \
+ __vector_quad, intent(INTENT) :: acc; \
+ vector(real(VKIND)), intent(in) :: a, b; \
+ end subroutine ;
+
+ ELEM_SUB_VQVIVI(inout,1) ELEM_SUB_VQVIVI(inout,2)
+ ELEM_SUB_VQVUVU(inout,1)
+ ELEM_SUB_VQVRVR(inout,4)
+ ELEM_SUB_VQVIVI(out,1) ELEM_SUB_VQVIVI(out,2)
+ ELEM_SUB_VQVUVU(out,1)
+ ELEM_SUB_VQVRVR(out,4)
+
+#undef ELEM_SUB_VQVRVR
+#undef ELEM_SUB_VQVUVU
+#undef ELEM_SUB_VQVIVI
+
+!! subroutine s(__vector_quad, __vector_pair, vector(u))
+#define ELEM_SUB_VQVPVU(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vpvu##VKIND(acc, a, b); \
+ __vector_quad, intent(INTENT) :: acc; \
+ __vector_pair, intent(in) :: a; \
+ vector(unsigned(VKIND)), intent(in) :: b; \
+ end subroutine ;
+
+!! subroutine s(__vector_quad, __vector_pair, vector(r))
+#define ELEM_SUB_VQVPVR(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vpvr##VKIND(acc, a, b); \
+ __vector_quad, intent(INTENT) :: acc; \
+ __vector_pair, intent(in) :: a; \
+ vector(real(VKIND)), intent(in) :: b; \
+ end subroutine ;
+
+ ELEM_SUB_VQVPVU(inout,1)
+ ELEM_SUB_VQVPVR(inout,8)
+ ELEM_SUB_VQVPVU(out,1)
+ ELEM_SUB_VQVPVR(out,8)
+
+#undef ELEM_SUB_VQVPVR
+#undef ELEM_SUB_VQVPVU
+
!! ========== 5 arguments subroutine interface ===============================!!
!! subroutine s(__vector_quad, vector(i), vector(i), vector(i), vector(i))
#define ELEM_SUB_VQVIVIVIVI(VKIND) \
@@ -76,6 +138,95 @@ elemental subroutine sub_vqvr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND(acc, arg1,
#undef ELEM_SUB_VQVUVUVUVU
#undef ELEM_SUB_VQVIVIVIVI
+!! subroutine s(__vector_quad, vector(u), vector(u), integer, integer)
+#define ELEM_SUB_VQVUVUII(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vu##VKIND##vu##VKIND##ii(acc, a, b, xmask, ymask); \
+ __vector_quad, intent(INTENT) :: acc; \
+ vector(unsigned(VKIND)), intent(in) :: a, b; \
+ integer(8), intent(in) :: xmask, ymask; \
+ !dir$ ignore_tkr(k) xmask; \
+ !dir$ ignore_tkr(k) ymask; \
+ end subroutine ;
+
+!! subroutine s(__vector_quad, vector(r), vector(r), integer, integer)
+#define ELEM_SUB_VQVRVRII(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vr##VKIND##vr##VKIND##ii(acc, a, b, xmask, ymask); \
+ __vector_quad, intent(INTENT) :: acc; \
+ vector(real(VKIND)), intent(in) :: a, b; \
+ integer(8), intent(in) :: xmask, ymask; \
+ !dir$ ignore_tkr(k) xmask; \
+ !dir$ ignore_tkr(k) ymask; \
+ end subroutine ;
+
+ ELEM_SUB_VQVUVUII(inout,1)
+ ELEM_SUB_VQVRVRII(inout,4)
+ ELEM_SUB_VQVUVUII(out,1)
+ ELEM_SUB_VQVRVRII(out,4)
+
+#undef ELEM_SUB_VQVRVRII
+#undef ELEM_SUB_VQVUVUII
+
+!! subroutine s(__vector_quad, __vector_pair, vector(u), integer, integer)
+#define ELEM_SUB_VQVPVUII(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vpvu##VKIND##ii(acc, a, b, xmask, ymask); \
+ __vector_quad, intent(INTENT) :: acc; \
+ __vector_pair, intent(in) :: a; \
+ vector(unsigned(VKIND)), intent(in) :: b; \
+ integer(8), intent(in) :: xmask, ymask; \
+ !dir$ ignore_tkr(k) xmask; \
+ !dir$ ignore_tkr(k) ymask; \
+ end subroutine ;
+
+!! subroutine s(__vector_quad, __vector_pair, vector(r), integer, integer)
+#define ELEM_SUB_VQVPVRII(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vpvr##VKIND##ii(acc, a, b, xmask, ymask); \
+ __vector_quad, intent(INTENT) :: acc; \
+ __vector_pair, intent(in) :: a; \
+ vector(real(VKIND)), intent(in) :: b; \
+ integer(8), intent(in) :: xmask, ymask; \
+ !dir$ ignore_tkr(k) xmask; \
+ !dir$ ignore_tkr(k) ymask; \
+ end subroutine ;
+
+ ELEM_SUB_VQVPVUII(inout,1)
+ ELEM_SUB_VQVPVRII(inout,8)
+ ELEM_SUB_VQVPVUII(out,1)
+ ELEM_SUB_VQVPVRII(out,8)
+
+#undef ELEM_SUB_VQVPVRII
+#undef ELEM_SUB_VQVPVUII
+
+!! ========== 6 arguments subroutine interface ===============================!!
+!! subroutine s(__vector_quad, vector(i), vector(i), integer, integer, integer)
+#define ELEM_SUB_VQVIVIIII(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vi##VKIND##vi##VKIND##iii(acc, a, b, xmask, ymask, pmask); \
+ __vector_quad, intent(INTENT) :: acc; \
+ vector(integer(VKIND)), intent(in) :: a, b; \
+ integer(8), intent(in) :: xmask, ymask, pmask; \
+ !dir$ ignore_tkr(k) xmask; \
+ !dir$ ignore_tkr(k) ymask; \
+ !dir$ ignore_tkr(k) pmask; \
+ end subroutine ;
+
+!! subroutine s(__vector_quad, vector(u), vector(u), integer, integer, integer)
+#define ELEM_SUB_VQVUVUIII(INTENT, VKIND) \
+ elemental subroutine sub_vq##INTENT##vu##VKIND##vu##VKIND##iii(acc, a, b, xmask, ymask, pmask); \
+ __vector_quad, intent(INTENT) :: acc; \
+ vector(unsigned(VKIND)), intent(in) :: a, b; \
+ integer(8), intent(in) :: xmask, ymask, pmask; \
+ !dir$ ignore_tkr(k) xmask; \
+ !dir$ ignore_tkr(k) ymask; \
+ !dir$ ignore_tkr(k) pmask; \
+ end subroutine ;
+
+ ELEM_SUB_VQVIVIIII(inout,1) ELEM_SUB_VQVIVIIII(inout,2)
+ ELEM_SUB_VQVUVUIII(inout,1)
+ ELEM_SUB_VQVIVIIII(out,1) ELEM_SUB_VQVIVIIII(out,2)
+ ELEM_SUB_VQVUVUIII(out,1)
+
+#undef ELEM_SUB_VQVUVUIII
+#undef ELEM_SUB_VQVIVIIII
+
!! ========== non-macro interface =============================================!!
elemental subroutine sub_atvp(data, pair)
! Dummy arg 'data' is supposed to be intent(out) of any type,
@@ -202,6 +353,552 @@ elemental subroutine sub_atvq(data, acc)
#undef SUB_VP_VU_VU
#undef SUB_VP_VI_VI
+#define SUB_VQ_VI_VI_I_I_I(NAME, VKIND) __ppc_##NAME##_vqvi##VKIND##vi##VKINDi0i0i0
+#define SUB_VQ_VU_VU_I_I_I(NAME, VKIND) __ppc_##NAME##_vqvu##VKIND##vu##VKINDi0i0i0
+
+#define VEC_SUB_VQ_VI_VI_I_I_I(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vi##VKIND##vi##VKIND##iii) :: SUB_VQ_VI_VI_I_I_I(NAME, VKIND);
+#define VEC_SUB_VQ_VU_VU_I_I_I(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vu##VKIND##vu##VKIND##iii) :: SUB_VQ_VU_VU_I_I_I(NAME, VKIND);
+
+! mma_pmxvbf16ger2
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2_,out,1)
+ interface mma_pmxvbf16ger2
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2_,1)
+ end interface mma_pmxvbf16ger2
+ public mma_pmxvbf16ger2
+
+! mma_pmxvbf16ger2nn
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2nn,inout,1)
+ interface mma_pmxvbf16ger2nn
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2nn,1)
+ end interface mma_pmxvbf16ger2nn
+ public mma_pmxvbf16ger2nn
+
+! mma_pmxvbf16ger2np
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2np,inout,1)
+ interface mma_pmxvbf16ger2np
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2np,1)
+ end interface mma_pmxvbf16ger2np
+ public mma_pmxvbf16ger2np
+
+! mma_pmxvbf16ger2pn
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2pn,inout,1)
+ interface mma_pmxvbf16ger2pn
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2pn,1)
+ end interface mma_pmxvbf16ger2pn
+ public mma_pmxvbf16ger2pn
+
+! mma_pmxvbf16ger2pp
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2pp,inout,1)
+ interface mma_pmxvbf16ger2pp
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2pp,1)
+ end interface mma_pmxvbf16ger2pp
+ public mma_pmxvbf16ger2pp
+
+! mma_pmxvf16ger2
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2_,out,1)
+ interface mma_pmxvf16ger2
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2_,1)
+ end interface mma_pmxvf16ger2
+ public mma_pmxvf16ger2
+
+! mma_pmxvf16ger2nn
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2nn,inout,1)
+ interface mma_pmxvf16ger2nn
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2nn,1)
+ end interface mma_pmxvf16ger2nn
+ public mma_pmxvf16ger2nn
+
+! mma_pmxvf16ger2np
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2np,inout,1)
+ interface mma_pmxvf16ger2np
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2np,1)
+ end interface mma_pmxvf16ger2np
+ public mma_pmxvf16ger2np
+
+! mma_pmxvf16ger2pn
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2pn,inout,1)
+ interface mma_pmxvf16ger2pn
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2pn,1)
+ end interface mma_pmxvf16ger2pn
+ public mma_pmxvf16ger2pn
+
+! mma_pmxvf16ger2pp
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2pp,inout,1)
+ interface mma_pmxvf16ger2pp
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2pp,1)
+ end interface mma_pmxvf16ger2pp
+ public mma_pmxvf16ger2pp
+
+! mma_pmxvi16ger2
+ VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2_,out,2)
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2_,out,1)
+ interface mma_pmxvi16ger2
+ procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2_,2)
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2_,1)
+ end interface mma_pmxvi16ger2
+ public mma_pmxvi16ger2
+
+! mma_pmxvi16ger2pp
+ VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2pp,inout,2)
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2pp,inout,1)
+ interface mma_pmxvi16ger2pp
+ procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2pp,2)
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2pp,1)
+ end interface mma_pmxvi16ger2pp
+ public mma_pmxvi16ger2pp
+
+! mma_pmxvi16ger2s
+ VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2s,out,2)
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2s,out,1)
+ interface mma_pmxvi16ger2s
+ procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2s,2)
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2s,1)
+ end interface mma_pmxvi16ger2s
+ public mma_pmxvi16ger2s
+
+! mma_pmxvi16ger2spp
+ VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2spp,inout,2)
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2spp,inout,1)
+ interface mma_pmxvi16ger2spp
+ procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2spp,2)
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2spp,1)
+ end interface mma_pmxvi16ger2spp
+ public mma_pmxvi16ger2spp
+
+! mma_pmxvi4ger8
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi4ger8_,out,1)
+ interface mma_pmxvi4ger8
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi4ger8_,1)
+ end interface mma_pmxvi4ger8
+ public mma_pmxvi4ger8
+
+! mma_pmxvi4ger8pp
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi4ger8pp,inout,1)
+ interface mma_pmxvi4ger8pp
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi4ger8pp,1)
+ end interface mma_pmxvi4ger8pp
+ public mma_pmxvi4ger8pp
+
+! mma_pmxvi8ger4
+ VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4_,out,1)
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4_,out,1)
+ interface mma_pmxvi8ger4
+ procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4_,1)
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4_,1)
+ end interface mma_pmxvi8ger4
+ public mma_pmxvi8ger4
+
+! mma_pmxvi8ger4pp
+ VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4pp,inout,1)
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4pp,inout,1)
+ interface mma_pmxvi8ger4pp
+ procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4pp,1)
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4pp,1)
+ end interface mma_pmxvi8ger4pp
+ public mma_pmxvi8ger4pp
+
+! mma_pmxvi8ger4spp
+ VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4spp,inout,1)
+ VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4spp,inout,1)
+ interface mma_pmxvi8ger4spp
+ procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4spp,1)
+ procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4spp,1)
+ end interface mma_pmxvi8ger4spp
+ public mma_pmxvi8ger4spp
+
+#undef VEC_SUB_VQ_VU_VU_I_I_I
+#undef VEC_SUB_VQ_VI_VI_I_I_I
+#undef SUB_VQ_VU_VU_I_I_I
+#undef SUB_VQ_VI_VI_I_I_I
+
+#define SUB_VQ_VU_VU_I_I(NAME, VKIND) __ppc_##NAME##_vqvu##VKIND##vu##VKINDi0i0
+#define SUB_VQ_VR_VR_I_I(NAME, VKIND) __ppc_##NAME##_vqvr##VKIND##vr##VKINDi0i0
+
+#define VEC_SUB_VQ_VU_VU_I_I(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vu##VKIND##vu##VKIND##ii) :: SUB_VQ_VU_VU_I_I(NAME, VKIND);
+#define VEC_SUB_VQ_VR_VR_I_I(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vr##VKIND##vr##VKIND##ii) :: SUB_VQ_VR_VR_I_I(NAME, VKIND);
+
+! mma_pmxvf32ger
+ VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32ger,out,1)
+ VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32ger,out,4)
+ interface mma_pmxvf32ger
+ procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32ger,1)
+ procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32ger,4)
+ end interface mma_pmxvf32ger
+ public mma_pmxvf32ger
+
+! mma_pmxvf32gernn
+ VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32gernn,inout,1)
+ VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32gernn,inout,4)
+ interface mma_pmxvf32gernn
+ procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32gernn,1)
+ procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32gernn,4)
+ end interface mma_pmxvf32gernn
+ public mma_pmxvf32gernn
+
+! mma_pmxvf32gernp
+ VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32gernp,inout,1)
+ VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32gernp,inout,4)
+ interface mma_pmxvf32gernp
+ procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32gernp,1)
+ procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32gernp,4)
+ end interface mma_pmxvf32gernp
+ public mma_pmxvf32gernp
+
+! mma_pmxvf32gerpn
+ VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32gerpn,inout,1)
+ VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32gerpn,inout,4)
+ interface mma_pmxvf32gerpn
+ procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32gerpn,1)
+ procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32gerpn,4)
+ end interface mma_pmxvf32gerpn
+ public mma_pmxvf32gerpn
+
+! mma_pmxvf32gerpp
+ VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32gerpp,inout,1)
+ VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32gerpp,inout,4)
+ interface mma_pmxvf32gerpp
+ procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32gerpp,1)
+ procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32gerpp,4)
+ end interface mma_pmxvf32gerpp
+ public mma_pmxvf32gerpp
+
+#undef VEC_SUB_VQ_VR_VR_I_I
+#undef VEC_SUB_VQ_VU_VU_I_I
+#undef SUB_VQ_VR_VR_I_I
+#undef SUB_VQ_VU_VU_I_I
+
+#define SUB_VQ_VP_VU_I_I(NAME, VKIND) __ppc_##NAME##_vqvpvu##VKINDi0i0
+#define SUB_VQ_VP_VR_I_I(NAME, VKIND) __ppc_##NAME##_vqvpvr##VKINDi0i0
+
+#define VEC_SUB_VQ_VP_VU_I_I(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vpvu##VKIND##ii) :: SUB_VQ_VP_VU_I_I(NAME, VKIND);
+#define VEC_SUB_VQ_VP_VR_I_I(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vpvr##VKIND##ii) :: SUB_VQ_VP_VR_I_I(NAME, VKIND);
+
+! mma_pmxvf64ger
+ VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64ger,out,1)
+ VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64ger,out,8)
+ interface mma_pmxvf64ger
+ procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64ger,1)
+ procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64ger,8)
+ end interface mma_pmxvf64ger
+ public mma_pmxvf64ger
+
+! mma_pmxvf64gernn
+ VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64gernn,inout,1)
+ VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64gernn,inout,8)
+ interface mma_pmxvf64gernn
+ procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64gernn,1)
+ procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64gernn,8)
+ end interface mma_pmxvf64gernn
+ public mma_pmxvf64gernn
+
+! mma_pmxvf64gernp
+ VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64gernp,inout,1)
+ VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64gernp,inout,8)
+ interface mma_pmxvf64gernp
+ procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64gernp,1)
+ procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64gernp,8)
+ end interface mma_pmxvf64gernp
+ public mma_pmxvf64gernp
+
+! mma_pmxvf64gerpn
+ VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64gerpn,inout,1)
+ VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64gerpn,inout,8)
+ interface mma_pmxvf64gerpn
+ procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64gerpn,1)
+ procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64gerpn,8)
+ end interface mma_pmxvf64gerpn
+ public mma_pmxvf64gerpn
+
+! mma_pmxvf64gerpp
+ VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64gerpp,inout,1)
+ VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64gerpp,inout,8)
+ interface mma_pmxvf64gerpp
+ procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64gerpp,1)
+ procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64gerpp,8)
+ end interface mma_pmxvf64gerpp
+ public mma_pmxvf64gerpp
+
+#undef VEC_SUB_VQ_VP_VR_I_I
+#undef VEC_SUB_VQ_VP_VU_I_I
+#undef SUB_VQ_VP_VR_I_I
+#undef SUB_VQ_VP_VU_I_I
+
+#define SUB_VQ_VI_VI(NAME, VKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND
+#define SUB_VQ_VU_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND
+#define SUB_VQ_VR_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND
+
+#define VEC_SUB_VQ_VI_VI(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vi##VKIND##vi##VKIND) :: SUB_VQ_VI_VI(NAME, VKIND);
+#define VEC_SUB_VQ_VU_VU(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vu##VKIND##vu##VKIND) :: SUB_VQ_VU_VU(NAME, VKIND);
+#define VEC_SUB_VQ_VR_VR(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vr##VKIND##vr##VKIND) :: SUB_VQ_VR_VR(NAME, VKIND);
+
+!! First argument with INTENT(INOUT)
+! mma_xvbf16ger2nn
+ VEC_SUB_VQ_VU_VU(mma_xvbf16ger2nn,inout,1)
+ interface mma_xvbf16ger2nn
+ procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2nn,1)
+ end interface
+ public mma_xvbf16ger2nn
+
+! mma_xvbf16ger2np
+ VEC_SUB_VQ_VU_VU(mma_xvbf16ger2np,inout,1)
+ interface mma_xvbf16ger2np
+ procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2np,1)
+ end interface
+ public mma_xvbf16ger2np
+
+! mma_xvbf16ger2pn
+ VEC_SUB_VQ_VU_VU(mma_xvbf16ger2pn,inout,1)
+ interface mma_xvbf16ger2pn
+ procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2pn,1)
+ end interface
+ public mma_xvbf16ger2pn
+
+! mma_xvbf16ger2pp
+ VEC_SUB_VQ_VU_VU(mma_xvbf16ger2pp,inout,1)
+ interface mma_xvbf16ger2pp
+ procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2pp,1)
+ end interface
+ public mma_xvbf16ger2pp
+
+! mma_xvi8ger4pp
+ VEC_SUB_VQ_VI_VI(mma_xvi8ger4pp,inout,1)
+ VEC_SUB_VQ_VU_VU(mma_xvi8ger4pp,inout,1)
+ interface mma_xvi8ger4pp
+ procedure :: SUB_VQ_VI_VI(mma_xvi8ger4pp,1)
+ procedure :: SUB_VQ_VU_VU(mma_xvi8ger4pp,1)
+ end interface
+ public mma_xvi8ger4pp
+
+! mma_xvi8ger4spp
+ VEC_SUB_VQ_VI_VI(mma_xvi8ger4spp,inout,1)
+ VEC_SUB_VQ_VU_VU(mma_xvi8ger4spp,inout,1)
+ interface mma_xvi8ger4spp
+ procedure :: SUB_VQ_VI_VI(mma_xvi8ger4spp,1)
+ procedure :: SUB_VQ_VU_VU(mma_xvi8ger4spp,1)
+ end interface
+ public mma_xvi8ger4spp
+
+! mma_xvi16ger2pp
+ VEC_SUB_VQ_VI_VI(mma_xvi16ger2pp,inout,2)
+ VEC_SUB_VQ_VU_VU(mma_xvi16ger2pp,inout,1)
+ interface mma_xvi16ger2pp
+ procedure :: SUB_VQ_VI_VI(mma_xvi16ger2pp,2)
+ procedure :: SUB_VQ_VU_VU(mma_xvi16ger2pp,1)
+ end interface
+ public mma_xvi16ger2pp
+
+! mma_xvi16ger2s
+ VEC_SUB_VQ_VI_VI(mma_xvi16ger2s,inout,2)
+ VEC_SUB_VQ_VU_VU(mma_xvi16ger2s,inout,1)
+ interface mma_xvi16ger2s
+ procedure :: SUB_VQ_VI_VI(mma_xvi16ger2s,2)
+ procedure :: SUB_VQ_VU_VU(mma_xvi16ger2s,1)
+ end interface
+ public mma_xvi16ger2s
+
+! mma_xvi16ger2spp
+ VEC_SUB_VQ_VI_VI(mma_xvi16ger2spp,inout,2)
+ VEC_SUB_VQ_VU_VU(mma_xvi16ger2spp,inout,1)
+ interface mma_xvi16ger2spp
+ procedure :: SUB_VQ_VI_VI(mma_xvi16ger2spp,2)
+ procedure :: SUB_VQ_VU_VU(mma_xvi16ger2spp,1)
+ end interface
+ public mma_xvi16ger2spp
+
+! mma_xvi4ger8pp
+ VEC_SUB_VQ_VU_VU(mma_xvi4ger8pp,inout,1)
+ interface mma_xvi4ger8pp
+ procedure :: SUB_VQ_VU_VU(mma_xvi4ger8pp,1)
+ end interface
+ public mma_xvi4ger8pp
+
+! mma_xvf16ger2nn
+ VEC_SUB_VQ_VU_VU(mma_xvf16ger2nn,inout,1)
+ interface mma_xvf16ger2nn
+ procedure :: SUB_VQ_VU_VU(mma_xvf16ger2nn,1)
+ end interface
+ public mma_xvf16ger2nn
+
+! mma_xvf16ger2np
+ VEC_SUB_VQ_VU_VU(mma_xvf16ger2np,inout,1)
+ interface mma_xvf16ger2np
+ procedure :: SUB_VQ_VU_VU(mma_xvf16ger2np,1)
+ end interface
+ public mma_xvf16ger2np
+
+! mma_xvf16ger2pn
+ VEC_SUB_VQ_VU_VU(mma_xvf16ger2pn,inout,1)
+ interface mma_xvf16ger2pn
+ procedure :: SUB_VQ_VU_VU(mma_xvf16ger2pn,1)
+ end interface
+ public mma_xvf16ger2pn
+
+! mma_xvf16ger2pp
+ VEC_SUB_VQ_VU_VU(mma_xvf16ger2pp,inout,1)
+ interface mma_xvf16ger2pp
+ procedure :: SUB_VQ_VU_VU(mma_xvf16ger2pp,1)
+ end interface
+ public mma_xvf16ger2pp
+
+! mma_xvf32gernn
+ VEC_SUB_VQ_VU_VU(mma_xvf32gernn,inout,1)
+ VEC_SUB_VQ_VR_VR(mma_xvf32gernn,inout,4)
+ interface mma_xvf32gernn
+ procedure :: SUB_VQ_VU_VU(mma_xvf32gernn,1)
+ procedure :: SUB_VQ_VR_VR(mma_xvf32gernn,4)
+ end interface
+ public mma_xvf32gernn
+
+! mma_xvf32gernp
+ VEC_SUB_VQ_VU_VU(mma_xvf32gernp,inout,1)
+ VEC_SUB_VQ_VR_VR(mma_xvf32gernp,inout,4)
+ interface mma_xvf32gernp
+ procedure :: SUB_VQ_VU_VU(mma_xvf32gernp,1)
+ procedure :: SUB_VQ_VR_VR(mma_xvf32gernp,4)
+ end interface
+ public mma_xvf32gernp
+
+! mma_xvf32gerpn
+ VEC_SUB_VQ_VU_VU(mma_xvf32gerpn,inout,1)
+ VEC_SUB_VQ_VR_VR(mma_xvf32gerpn,inout,4)
+ interface mma_xvf32gerpn
+ procedure :: SUB_VQ_VU_VU(mma_xvf32gerpn,1)
+ procedure :: SUB_VQ_VR_VR(mma_xvf32gerpn,4)
+ end interface
+ public mma_xvf32gerpn
+
+! mma_xvf32gerpp
+ VEC_SUB_VQ_VU_VU(mma_xvf32gerpp,inout,1)
+ VEC_SUB_VQ_VR_VR(mma_xvf32gerpp,inout,4)
+ interface mma_xvf32gerpp
+ procedure :: SUB_VQ_VU_VU(mma_xvf32gerpp,1)
+ procedure :: SUB_VQ_VR_VR(mma_xvf32gerpp,4)
+ end interface
+ public mma_xvf32gerpp
+
+!! First argument with INTENT(OUT)
+! mma_xvbf16ger2
+ VEC_SUB_VQ_VU_VU(mma_xvbf16ger2_,out,1)
+ interface mma_xvbf16ger2
+ procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2_,1)
+ end interface
+ public mma_xvbf16ger2
+
+! mma_xvi16ger2
+ VEC_SUB_VQ_VI_VI(mma_xvi16ger2_,out,2)
+ VEC_SUB_VQ_VU_VU(mma_xvi16ger2_,out,1)
+ interface mma_xvi16ger2
+ procedure :: SUB_VQ_VI_VI(mma_xvi16ger2_,2)
+ procedure :: SUB_VQ_VU_VU(mma_xvi16ger2_,1)
+ end interface
+ public mma_xvi16ger2
+
+! mma_xvi4ger8
+ VEC_SUB_VQ_VU_VU(mma_xvi4ger8_,out,1)
+ interface mma_xvi4ger8
+ procedure :: SUB_VQ_VU_VU(mma_xvi4ger8_,1)
+ end interface
+ public mma_xvi4ger8
+
+! mma_xvi8ger4
+ VEC_SUB_VQ_VI_VI(mma_xvi8ger4_,out,1)
+ VEC_SUB_VQ_VU_VU(mma_xvi8ger4_,out,1)
+ interface mma_xvi8ger4
+ procedure :: SUB_VQ_VI_VI(mma_xvi8ger4_,1)
+ procedure :: SUB_VQ_VU_VU(mma_xvi8ger4_,1)
+ end interface
+ public mma_xvi8ger4
+
+! mma_xvf16ger2
+ VEC_SUB_VQ_VU_VU(mma_xvf16ger2_,out,1)
+ interface mma_xvf16ger2
+ procedure :: SUB_VQ_VU_VU(mma_xvf16ger2_,1)
+ end interface
+ public mma_xvf16ger2
+
+! mma_xvf32ger
+ VEC_SUB_VQ_VU_VU(mma_xvf32ger,out,1)
+ VEC_SUB_VQ_VR_VR(mma_xvf32ger,out,4)
+ interface mma_xvf32ger
+ procedure :: SUB_VQ_VU_VU(mma_xvf32ger,1)
+ procedure :: SUB_VQ_VR_VR(mma_xvf32ger,4)
+ end interface
+ public mma_xvf32ger
+
+#undef VEC_SUB_VQ_VR_VR
+#undef VEC_SUB_VQ_VU_VU
+#undef VEC_SUB_VQ_VI_VI
+#undef SUB_VQ_VR_VR
+#undef SUB_VQ_VU_VU
+#undef SUB_VQ_VI_VI
+
+#define SUB_VQ_VP_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND
+#define SUB_VQ_VP_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND
+
+#define VEC_SUB_VQ_VP_VU(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vpvu##VKIND) :: SUB_VQ_VP_VU(NAME, VKIND);
+#define VEC_SUB_VQ_VP_VR(NAME, INTENT, VKIND) \
+ procedure(sub_vq##INTENT##vpvr##VKIND) :: SUB_VQ_VP_VR(NAME, VKIND);
+
+! mma_xvf64ger
+ VEC_SUB_VQ_VP_VU(mma_xvf64ger,out,1)
+ VEC_SUB_VQ_VP_VR(mma_xvf64ger,out,8)
+ interface mma_xvf64ger
+ procedure :: SUB_VQ_VP_VU(mma_xvf64ger,1)
+ procedure :: SUB_VQ_VP_VR(mma_xvf64ger,8)
+ end interface
+ public mma_xvf64ger
+
+! mma_xvf64gernn
+ VEC_SUB_VQ_VP_VU(mma_xvf64gernn,inout,1)
+ VEC_SUB_VQ_VP_VR(mma_xvf64gernn,inout,8)
+ interface mma_xvf64gernn
+ procedure :: SUB_VQ_VP_VU(mma_xvf64gernn,1)
+ procedure :: SUB_VQ_VP_VR(mma_xvf64gernn,8)
+ end interface
+ public mma_xvf64gernn
+
+! mma_xvf64gernp
+ VEC_SUB_VQ_VP_VU(mma_xvf64gernp,inout,1)
+ VEC_SUB_VQ_VP_VR(mma_xvf64gernp,inout,8)
+ interface mma_xvf64gernp
+ procedure :: SUB_VQ_VP_VU(mma_xvf64gernp,1)
+ procedure :: SUB_VQ_VP_VR(mma_xvf64gernp,8)
+ end interface
+ public mma_xvf64gernp
+
+! mma_xvf64gerpn
+ VEC_SUB_VQ_VP_VU(mma_xvf64gerpn,inout,1)
+ VEC_SUB_VQ_VP_VR(mma_xvf64gerpn,inout,8)
+ interface mma_xvf64gerpn
+ procedure :: SUB_VQ_VP_VU(mma_xvf64gerpn,1)
+ procedure :: SUB_VQ_VP_VR(mma_xvf64gerpn,8)
+ end interface
+ public mma_xvf64gerpn
+
+! mma_xvf64gerpp
+ VEC_SUB_VQ_VP_VU(mma_xvf64gerpp,inout,1)
+ VEC_SUB_VQ_VP_VR(mma_xvf64gerpp,inout,8)
+ interface mma_xvf64gerpp
+ procedure :: SUB_VQ_VP_VU(mma_xvf64gerpp,1)
+ procedure :: SUB_VQ_VP_VR(mma_xvf64gerpp,8)
+ end interface
+ public mma_xvf64gerpp
+
+#undef VEC_SUB_VQ_VP_VR
+#undef VEC_SUB_VQ_VP_VU
+#undef SUB_VQ_VP_VR
+#undef SUB_VQ_VP_VU
+
! mma_disassemble_acc
procedure(sub_atvq) :: __ppc_mma_disassemble_acc
interface mma_disassemble_acc
@@ -216,5 +913,25 @@ elemental subroutine sub_atvq(data, acc)
end interface
public mma_disassemble_pair
-end module
+! mma_xxmfacc
+ procedure(sub_vq) :: __ppc_mma_xxmfacc
+ interface mma_xxmfacc
+ procedure :: __ppc_mma_xxmfacc
+ end interface
+ public mma_xxmfacc
+! mma_xxmtacc
+ procedure(sub_vq) :: __ppc_mma_xxmtacc
+ interface mma_xxmtacc
+ procedure :: __ppc_mma_xxmtacc
+ end interface
+ public mma_xxmtacc
+
+! mma_xxsetaccz
+ procedure(sub_vq) :: __ppc_mma_xxsetaccz
+ interface mma_xxsetaccz
+ procedure :: __ppc_mma_xxsetaccz
+ end interface
+ public mma_xxsetaccz
+
+end module
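
The macro layering above keeps the module compact but is dense to read.
Expanded by hand, one of the new six-argument abstract interfaces,
ELEM_SUB_VQVUVUIII(inout,1), becomes the following (shown for illustration
only; this exact text does not appear in the patch):

  elemental subroutine sub_vqinoutvu1vu1iii(acc, a, b, xmask, ymask, pmask)
    __vector_quad, intent(inout) :: acc
    vector(unsigned(1)), intent(in) :: a, b
    integer(8), intent(in) :: xmask, ymask, pmask
    !dir$ ignore_tkr(k) xmask
    !dir$ ignore_tkr(k) ymask
    !dir$ ignore_tkr(k) pmask
  end subroutine

The ignore_tkr(k) directives relax kind checking on the mask arguments, so
callers may pass integer literals of any kind (7, 7_2, 2_8, and so on); the
*_non_def tests added below exercise exactly that.
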
diff --git a/flang/test/Lower/PowerPC/ppc-mma-accumulator-move-clear.f90 b/flang/test/Lower/PowerPC/ppc-mma-accumulator-move-clear.f90
new file mode 100644
index 00000000000000..cc9689b70343c6
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-mma-accumulator-move-clear.f90
@@ -0,0 +1,40 @@
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+ subroutine test_xxmfacc()
+ use, intrinsic :: mma
+ implicit none
+ __vector_quad :: cq
+ call mma_xxmfacc(cq)
+ end subroutine test_xxmfacc
+
+!CHECK-LABEL: @test_xxmfacc_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = load <512 x i1>, ptr %1, align 64
+!CHECK: %3 = call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> %2)
+!CHECK: store <512 x i1> %3, ptr %1, align 64
+
+ subroutine test_xxmtacc()
+ use, intrinsic :: mma
+ implicit none
+ __vector_quad :: cq
+ call mma_xxmtacc(cq)
+ end subroutine test_xxmtacc
+
+!CHECK-LABEL: @test_xxmtacc_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = load <512 x i1>, ptr %1, align 64
+!CHECK: %3 = call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> %2)
+!CHECK: store <512 x i1> %3, ptr %1, align 64
+
+ subroutine test_xxsetaccz()
+ use, intrinsic :: mma
+ implicit none
+ __vector_quad :: cq
+ call mma_xxsetaccz(cq)
+ end subroutine test_xxsetaccz
+
+!CHECK-LABEL: @test_xxsetaccz_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+!CHECK: store <512 x i1> %2, ptr %1, align 64
diff --git a/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90 b/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
index 673ec4b846354f..1ae6c5305345f3 100644
--- a/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
+++ b/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
@@ -1,4 +1,4 @@
-! RUN: %flang --target=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -emit-llvm -S %s -o - | FileCheck --check-prefixes="CHECK" %s
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
! REQUIRES: target=powerpc{{.*}}
! mma_assemble_acc
diff --git a/flang/test/Lower/PowerPC/ppc-mma-outer-product-1.f90 b/flang/test/Lower/PowerPC/ppc-mma-outer-product-1.f90
new file mode 100644
index 00000000000000..96c7d65a1817ab
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-mma-outer-product-1.f90
@@ -0,0 +1,1701 @@
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+ subroutine test_pmxvbf16ger2_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvbf16ger2_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+
+ subroutine test_pmxvbf16ger2_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvbf16ger2_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+
+ subroutine test_pmxvbf16ger2nn_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2nn(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvbf16ger2nn_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2nn_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvbf16ger2nn_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2nn(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvbf16ger2nn_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2nn_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvbf16ger2np_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2np(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvbf16ger2np_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2np_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvbf16ger2np_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2np(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvbf16ger2np_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2np_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvbf16ger2pn_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2pn(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvbf16ger2pn_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2pn_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvbf16ger2pn_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2pn(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvbf16ger2pn_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2pn_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvbf16ger2pp_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2pp(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvbf16ger2pp_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2pp_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvbf16ger2pp_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvbf16ger2pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvbf16ger2pp_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2pp_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvf16ger2_def
+
+!CHECK-LABEL: @test_pmxvf16ger2_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvf16ger2_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2nn_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2nn(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvf16ger2nn_def
+
+!CHECK-LABEL: @test_pmxvf16ger2nn_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2nn_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2nn(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvf16ger2nn_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2nn_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2np_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2np(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvf16ger2np_def
+
+!CHECK-LABEL: @test_pmxvf16ger2np_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2np_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2np(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvf16ger2np_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2np_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2pn_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2pn(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvf16ger2pn_def
+
+!CHECK-LABEL: @test_pmxvf16ger2pn_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2pn_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2pn(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvf16ger2pn_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2pn_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2pp_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2pp(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvf16ger2pp_def
+
+!CHECK-LABEL: @test_pmxvf16ger2pp_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf16ger2pp_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf16ger2pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvf16ger2pp_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2pp_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32ger_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32ger(cq, vu10, vu11, 7, 2)
+ end subroutine test_pmxvf32ger_u1_def
+
+!CHECK-LABEL: @test_pmxvf32ger_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvf32ger_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32ger(cq, vu10, vu11, 7_2, 2_1)
+ end subroutine test_pmxvf32ger_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32ger_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvf32ger_r4_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32ger(cq, vr40, vr41, 7, 2)
+ end subroutine test_pmxvf32ger_r4_def
+
+!CHECK-LABEL: @test_pmxvf32ger_r4_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %7 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %6, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_pmxvf32ger_r4_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32ger(cq, vr40, vr41, 7_2, 2_1)
+ end subroutine test_pmxvf32ger_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32ger_r4_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %7 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %6, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_pmxvf32gernn_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32gernn(cq, vu10, vu11, 7, 2)
+ end subroutine test_pmxvf32gernn_u1_def
+
+!CHECK-LABEL: @test_pmxvf32gernn_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32gernn_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32gernn(cq, vu10, vu11, 7_2, 2_1)
+ end subroutine test_pmxvf32gernn_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32gernn_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32gernn_r4_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32gernn(cq, vr40, vr41, 7, 2)
+ end subroutine test_pmxvf32gernn_r4_def
+
+!CHECK-LABEL: @test_pmxvf32gernn_r4_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvf32gernn_r4_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32gernn(cq, vr40, vr41, 7_2, 2_1)
+ end subroutine test_pmxvf32gernn_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32gernn_r4_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvf32gernp_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32gernp(cq, vu10, vu11, 7, 2)
+ end subroutine test_pmxvf32gernp_u1_def
+
+!CHECK-LABEL: @test_pmxvf32gernp_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32gernp_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32gernp(cq, vu10, vu11, 7_2, 2_1)
+ end subroutine test_pmxvf32gernp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32gernp_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32gernp_r4_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32gernp(cq, vr40, vr41, 7, 2)
+ end subroutine test_pmxvf32gernp_r4_def
+
+!CHECK-LABEL: @test_pmxvf32gernp_r4_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvf32gernp_r4_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32gernp(cq, vr40, vr41, 7_2, 2_1)
+ end subroutine test_pmxvf32gernp_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32gernp_r4_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvf32gerpn_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32gerpn(cq, vu10, vu11, 7, 2)
+ end subroutine test_pmxvf32gerpn_u1_def
+
+!CHECK-LABEL: @test_pmxvf32gerpn_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32gerpn_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32gerpn(cq, vu10, vu11, 7_2, 2_1)
+ end subroutine test_pmxvf32gerpn_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32gerpn_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32gerpn_r4_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32gerpn(cq, vr40, vr41, 7, 2)
+ end subroutine test_pmxvf32gerpn_r4_def
+
+!CHECK-LABEL: @test_pmxvf32gerpn_r4_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvf32gerpn_r4_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32gerpn(cq, vr40, vr41, 7_2, 2_1)
+ end subroutine test_pmxvf32gerpn_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32gerpn_r4_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvf32gerpp_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32gerpp(cq, vu10, vu11, 7, 2)
+ end subroutine test_pmxvf32gerpp_u1_def
+
+!CHECK-LABEL: @test_pmxvf32gerpp_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32gerpp_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvf32gerpp(cq, vu10, vu11, 7_2, 2_1)
+ end subroutine test_pmxvf32gerpp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32gerpp_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvf32gerpp_r4_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32gerpp(cq, vr40, vr41, 7, 2)
+ end subroutine test_pmxvf32gerpp_r4_def
+
+!CHECK-LABEL: @test_pmxvf32gerpp_r4_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvf32gerpp_r4_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_pmxvf32gerpp(cq, vr40, vr41, 7_2, 2_1)
+ end subroutine test_pmxvf32gerpp_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32gerpp_r4_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvf64ger_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64ger(cq, cp, vu10, 7, 2)
+ end subroutine test_pmxvf64ger_u1_def
+
+!CHECK-LABEL: @test_pmxvf64ger_u1_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %2, align 64
+
+ subroutine test_pmxvf64ger_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64ger(cq, cp, vu10, 7_2, 2_1)
+ end subroutine test_pmxvf64ger_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64ger_u1_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %2, align 64
+
+ subroutine test_pmxvf64ger_r8_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64ger(cq, cp, vr80, 7, 2)
+ end subroutine test_pmxvf64ger_r8_def
+
+!CHECK-LABEL: @test_pmxvf64ger_r8_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %4, <16 x i8> %6, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64ger_r8_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64ger(cq, cp, vr80, 7_2, 2_1)
+ end subroutine test_pmxvf64ger_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64ger_r8_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %4, <16 x i8> %6, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gernn_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gernn(cq, cp, vu10, 7, 2)
+ end subroutine test_pmxvf64gernn_u1_def
+
+!CHECK-LABEL: @test_pmxvf64gernn_u1_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gernn_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gernn(cq, cp, vu10, 7_2, 2_1)
+ end subroutine test_pmxvf64gernn_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64gernn_u1_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gernn_r8_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gernn(cq, cp, vr80, 7, 2)
+ end subroutine test_pmxvf64gernn_r8_def
+
+!CHECK-LABEL: @test_pmxvf64gernn_r8_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_pmxvf64gernn_r8_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gernn(cq, cp, vr80, 7_2, 2_1)
+ end subroutine test_pmxvf64gernn_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64gernn_r8_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_pmxvf64gernp_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gernp(cq, cp, vu10, 7, 2)
+ end subroutine test_pmxvf64gernp_u1_def
+
+!CHECK-LABEL: @test_pmxvf64gernp_u1_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gernp_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gernp(cq, cp, vu10, 7_2, 2_1)
+ end subroutine test_pmxvf64gernp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64gernp_u1_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gernp_r8_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gernp(cq, cp, vr80, 7, 2)
+ end subroutine test_pmxvf64gernp_r8_def
+
+!CHECK-LABEL: @test_pmxvf64gernp_r8_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_pmxvf64gernp_r8_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gernp(cq, cp, vr80, 7_2, 2_1)
+ end subroutine test_pmxvf64gernp_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64gernp_r8_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_pmxvf64gerpn_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gerpn(cq, cp, vu10, 7, 2)
+ end subroutine test_pmxvf64gerpn_u1_def
+
+!CHECK-LABEL: @test_pmxvf64gerpn_u1_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gerpn_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gerpn(cq, cp, vu10, 7_2, 2_1)
+ end subroutine test_pmxvf64gerpn_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64gerpn_u1_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gerpn_r8_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gerpn(cq, cp, vr80, 7, 2)
+ end subroutine test_pmxvf64gerpn_r8_def
+
+!CHECK-LABEL: @test_pmxvf64gerpn_r8_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_pmxvf64gerpn_r8_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gerpn(cq, cp, vr80, 7_2, 2_1)
+ end subroutine test_pmxvf64gerpn_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64gerpn_r8_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_pmxvf64gerpp_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gerpp(cq, cp, vu10, 7, 2)
+ end subroutine test_pmxvf64gerpp_u1_def
+
+!CHECK-LABEL: @test_pmxvf64gerpp_u1_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gerpp_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gerpp(cq, cp, vu10, 7_2, 2_1)
+ end subroutine test_pmxvf64gerpp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64gerpp_u1_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_pmxvf64gerpp_r8_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gerpp(cq, cp, vr80, 7, 2)
+ end subroutine test_pmxvf64gerpp_r8_def
+
+!CHECK-LABEL: @test_pmxvf64gerpp_r8_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_pmxvf64gerpp_r8_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_pair :: cp
+ __vector_quad :: cq
+ call mma_pmxvf64gerpp(cq, cp, vr80, 7_2, 2_1)
+ end subroutine test_pmxvf64gerpp_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64gerpp_r8_non_def_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_pmxvi16ger2_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi16ger2(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi16ger2_u1_def
+
+!CHECK-LABEL: @test_pmxvi16ger2_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi16ger2(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi16ger2_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2_i2_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_pmxvi16ger2(cq, vi20, vi21, 7, 7, 2)
+ end subroutine test_pmxvi16ger2_i2_def
+
+!CHECK-LABEL: @test_pmxvi16ger2_i2_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %6, <16 x i8> %7, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2_i2_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_pmxvi16ger2(cq, vi20, vi21, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi16ger2_i2_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2_i2_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %6, <16 x i8> %7, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2pp_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi16ger2pp(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi16ger2pp_u1_def
+
+!CHECK-LABEL: @test_pmxvi16ger2pp_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2pp_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi16ger2pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi16ger2pp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2pp_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2pp_i2_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_pmxvi16ger2pp(cq, vi20, vi21, 7, 7, 2)
+ end subroutine test_pmxvi16ger2pp_i2_def
+
+!CHECK-LABEL: @test_pmxvi16ger2pp_i2_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2pp_i2_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_pmxvi16ger2pp(cq, vi20, vi21, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi16ger2pp_i2_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2pp_i2_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2s_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi16ger2s(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi16ger2s_u1_def
+
+!CHECK-LABEL: @test_pmxvi16ger2s_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2s_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi16ger2s(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi16ger2s_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2s_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2s_i2_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_pmxvi16ger2s(cq, vi20, vi21, 7, 7, 2)
+ end subroutine test_pmxvi16ger2s_i2_def
+
+!CHECK-LABEL: @test_pmxvi16ger2s_i2_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %6, <16 x i8> %7, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2s_i2_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_pmxvi16ger2s(cq, vi20, vi21, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi16ger2s_i2_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2s_i2_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %6, <16 x i8> %7, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2spp_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi16ger2spp(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi16ger2spp_u1_def
+
+!CHECK-LABEL: @test_pmxvi16ger2spp_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2spp_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi16ger2spp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi16ger2spp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2spp_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2spp_i2_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_pmxvi16ger2spp(cq, vi20, vi21, 7, 7, 2)
+ end subroutine test_pmxvi16ger2spp_i2_def
+
+!CHECK-LABEL: @test_pmxvi16ger2spp_i2_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_pmxvi16ger2spp_i2_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_pmxvi16ger2spp(cq, vi20, vi21, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi16ger2spp_i2_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2spp_i2_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+
+ subroutine test_pmxvi4ger8_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi4ger8(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi4ger8_def
+
+!CHECK-LABEL: @test_pmxvi4ger8_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi4ger8_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi4ger8(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi4ger8_non_def
+
+!CHECK-LABEL: @test_pmxvi4ger8_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi4ger8pp_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi4ger8pp(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi4ger8pp_def
+
+!CHECK-LABEL: @test_pmxvi4ger8pp_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi4ger8pp_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi4ger8pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi4ger8pp_non_def
+
+!CHECK-LABEL: @test_pmxvi4ger8pp_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi8ger4_u1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi8ger4_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4_i1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4(cq, vi10, vi11, 7, 7, 2)
+ end subroutine test_pmxvi8ger4_i1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4_i1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4_i1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4(cq, vi10, vi11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi8ger4_i1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4_i1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4pp_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4pp(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi8ger4pp_u1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4pp_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4pp_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi8ger4pp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4pp_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4pp_i1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4pp(cq, vi10, vi11, 7, 7, 2)
+ end subroutine test_pmxvi8ger4pp_i1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4pp_i1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4pp_i1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4pp(cq, vi10, vi11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi8ger4pp_i1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4pp_i1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4spp_u1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4spp(cq, vu10, vu11, 7, 7, 2)
+ end subroutine test_pmxvi8ger4spp_u1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4spp_u1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4spp_u1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4spp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi8ger4spp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4spp_u1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4spp_i1_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4spp(cq, vi10, vi11, 7, 7, 2)
+ end subroutine test_pmxvi8ger4spp_i1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4spp_i1_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_pmxvi8ger4spp_i1_non_def()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_pmxvi8ger4spp(cq, vi10, vi11, 7_2, 7_1, 2_8)
+ end subroutine test_pmxvi8ger4spp_i1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4spp_i1_non_def_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
diff --git a/flang/test/Lower/PowerPC/ppc-mma-outer-product-2.f90 b/flang/test/Lower/PowerPC/ppc-mma-outer-product-2.f90
new file mode 100644
index 00000000000000..778d58a745be9d
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-mma-outer-product-2.f90
@@ -0,0 +1,856 @@
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+ subroutine test_xvbf16ger2()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvbf16ger2(cq, vu10, vu11)
+ end subroutine test_xvbf16ger2
+
+!CHECK-LABEL: @test_xvbf16ger2_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+
+ subroutine test_xvbf16ger2nn()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvbf16ger2nn(cq, vu10, vu11)
+ end subroutine test_xvbf16ger2nn
+
+!CHECK-LABEL: @test_xvbf16ger2nn_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvbf16ger2np()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvbf16ger2np(cq, vu10, vu11)
+ end subroutine test_xvbf16ger2np
+
+!CHECK-LABEL: @test_xvbf16ger2np_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvbf16ger2pn()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvbf16ger2pn(cq, vu10, vu11)
+ end subroutine test_xvbf16ger2pn
+
+!CHECK-LABEL: @test_xvbf16ger2pn_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvbf16ger2pp()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvbf16ger2pp(cq, vu10, vu11)
+ end subroutine test_xvbf16ger2pp
+
+!CHECK-LABEL: @test_xvbf16ger2pp_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvf16ger2()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf16ger2(cq, vu10, vu11)
+ end subroutine test_xvf16ger2
+
+!CHECK-LABEL: @test_xvf16ger2_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_xvf16ger2nn()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf16ger2nn(cq, vu10, vu11)
+ end subroutine test_xvf16ger2nn
+
+!CHECK-LABEL: @test_xvf16ger2nn_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvf16ger2np()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf16ger2np(cq, vu10, vu11)
+ end subroutine test_xvf16ger2np
+
+!CHECK-LABEL: @test_xvf16ger2np_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvf16ger2pn()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf16ger2pn(cq, vu10, vu11)
+ end subroutine test_xvf16ger2pn
+
+!CHECK-LABEL: @test_xvf16ger2pn_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvf16ger2pp()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf16ger2pp(cq, vu10, vu11)
+ end subroutine test_xvf16ger2pp
+
+!CHECK-LABEL: @test_xvf16ger2pp_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvf32ger_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf32ger(cq, vu10, vu11)
+ end subroutine test_xvf32ger_u1
+
+!CHECK-LABEL: @test_xvf32ger_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+
+ subroutine test_xvf32ger_r4()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_xvf32ger(cq, vr40, vr41)
+ end subroutine test_xvf32ger_r4
+
+!CHECK-LABEL: @test_xvf32ger_r4_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %7 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %6, <16 x i8> %7)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_xvf32gernn_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf32gernn(cq, vu10, vu11)
+ end subroutine test_xvf32gernn_u1
+
+!CHECK-LABEL: @test_xvf32gernn_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvf32gernn_r4()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_xvf32gernn(cq, vr40, vr41)
+ end subroutine test_xvf32gernn_r4
+
+!CHECK-LABEL: @test_xvf32gernn_r4_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_xvf32gernp_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf32gernp(cq, vu10, vu11)
+ end subroutine test_xvf32gernp_u1
+
+!CHECK-LABEL: @test_xvf32gernp_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvf32gernp_r4()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_xvf32gernp(cq, vr40, vr41)
+ end subroutine test_xvf32gernp_r4
+
+!CHECK-LABEL: @test_xvf32gernp_r4_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_xvf32gerpn_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf32gerpn(cq, vu10, vu11)
+ end subroutine test_xvf32gerpn_u1
+
+!CHECK-LABEL: @test_xvf32gerpn_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvf32gerpn_r4()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_xvf32gerpn(cq, vr40, vr41)
+ end subroutine test_xvf32gerpn_r4
+
+!CHECK-LABEL: @test_xvf32gerpn_r4_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_xvf32gerpp_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvf32gerpp(cq, vu10, vu11)
+ end subroutine test_xvf32gerpp_u1
+
+!CHECK-LABEL: @test_xvf32gerpp_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+
+ subroutine test_xvf32gerpp_r4()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(4)) vr40, vr41
+ __vector_quad :: cq
+ call mma_xvf32gerpp(cq, vr40, vr41)
+ end subroutine test_xvf32gerpp_r4
+
+!CHECK-LABEL: @test_xvf32gerpp_r4_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <4 x float>, i64 1, align 16
+!CHECK: %3 = alloca <4 x float>, i64 1, align 16
+!CHECK: %4 = load <4 x float>, ptr %2, align 16
+!CHECK: %5 = load <4 x float>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK: %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_xvf64ger_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64ger(cq, cp, vu10)
+ end subroutine test_xvf64ger_u1
+
+!CHECK-LABEL: @test_xvf64ger_u1_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %2, align 64
+
+ subroutine test_xvf64ger_r8()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64ger(cq, cp, vr80)
+ end subroutine test_xvf64ger_r8
+
+!CHECK-LABEL: @test_xvf64ger_r8_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %4, <16 x i8> %6)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+
+ subroutine test_xvf64gernn_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64gernn(cq, cp, vu10)
+ end subroutine test_xvf64gernn_u1
+
+!CHECK-LABEL: @test_xvf64gernn_u1_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+
+ subroutine test_xvf64gernn_r8()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64gernn(cq, cp, vr80)
+ end subroutine test_xvf64gernn_r8
+
+!CHECK-LABEL: @test_xvf64gernn_r8_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_xvf64gernp_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64gernp(cq, cp, vu10)
+ end subroutine test_xvf64gernp_u1
+
+!CHECK-LABEL: @test_xvf64gernp_u1_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_xvf64gernp_r8()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64gernp(cq, cp, vr80)
+ end subroutine test_xvf64gernp_r8
+
+!CHECK-LABEL: @test_xvf64gernp_r8_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_xvf64gerpn_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64gerpn(cq, cp, vu10)
+ end subroutine test_xvf64gerpn_u1
+
+!CHECK-LABEL: @test_xvf64gerpn_u1_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+ subroutine test_xvf64gerpn_r8()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64gerpn(cq, cp, vr80)
+ end subroutine test_xvf64gerpn_r8
+
+!CHECK-LABEL: @test_xvf64gerpn_r8_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_xvf64gerpp_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64gerpp(cq, cp, vu10)
+ end subroutine test_xvf64gerpp_u1
+
+!CHECK-LABEL: @test_xvf64gerpp_u1_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %2, align 64
+
+
+ subroutine test_xvf64gerpp_r8()
+ use, intrinsic :: mma
+ implicit none
+ vector(real(8)) vr80
+ __vector_quad :: cq
+ __vector_pair :: cp
+ call mma_xvf64gerpp(cq, cp, vr80)
+ end subroutine test_xvf64gerpp_r8
+
+!CHECK-LABEL: @test_xvf64gerpp_r8_
+!CHECK: %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %3 = alloca <2 x double>, i64 1, align 16
+!CHECK: %4 = load <256 x i1>, ptr %1, align 32
+!CHECK: %5 = load <2 x double>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %2, align 64
+!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7)
+!CHECK: store <512 x i1> %8, ptr %2, align 64
+
+ subroutine test_xvi16ger2_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi16ger2(cq, vu10, vu11)
+ end subroutine test_xvi16ger2_u1
+
+!CHECK-LABEL: @test_xvi16ger2_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_xvi16ger2_i2()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_xvi16ger2(cq, vi20, vi21)
+ end subroutine test_xvi16ger2_i2
+
+!CHECK-LABEL: @test_xvi16ger2_i2_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %6, <16 x i8> %7)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_xvi16ger2pp_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi16ger2pp(cq, vu10, vu11)
+ end subroutine test_xvi16ger2pp_u1
+
+!CHECK-LABEL: @test_xvi16ger2pp_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvi16ger2pp_i2()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_xvi16ger2pp(cq, vi20, vi21)
+ end subroutine test_xvi16ger2pp_i2
+
+!CHECK-LABEL: @test_xvi16ger2pp_i2_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_xvi16ger2s_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi16ger2s(cq, vu10, vu11)
+ end subroutine test_xvi16ger2s_u1
+
+!CHECK-LABEL: @test_xvi16ger2s_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_xvi16ger2s_i2()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_xvi16ger2s(cq, vi20, vi21)
+ end subroutine test_xvi16ger2s_i2
+
+!CHECK-LABEL: @test_xvi16ger2s_i2_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %8 = call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %6, <16 x i8> %7)
+!CHECK: store <512 x i1> %8, ptr %1, align 64
+
+ subroutine test_xvi16ger2spp_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi16ger2spp(cq, vu10, vu11)
+ end subroutine test_xvi16ger2spp_u1
+
+!CHECK-LABEL: @test_xvi16ger2spp_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvi16ger2spp_i2()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(2)) vi20, vi21
+ __vector_quad :: cq
+ call mma_xvi16ger2spp(cq, vi20, vi21)
+ end subroutine test_xvi16ger2spp_i2
+
+!CHECK-LABEL: @test_xvi16ger2spp_i2_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK: %4 = load <8 x i16>, ptr %2, align 16
+!CHECK: %5 = load <8 x i16>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK: %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK: %9 = call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK: store <512 x i1> %9, ptr %1, align 64
+
+ subroutine test_xvi4ger8()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi4ger8(cq, vu10, vu11)
+ end subroutine test_xvi4ger8
+
+!CHECK-LABEL: @test_xvi4ger8_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_xvi4ger8pp()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi4ger8pp(cq, vu10, vu11)
+ end subroutine test_xvi4ger8pp
+
+!CHECK-LABEL: @test_xvi4ger8pp_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvi8ger4_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi8ger4(cq, vu10, vu11)
+ end subroutine test_xvi8ger4_u1
+
+!CHECK-LABEL: @test_xvi8ger4_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+
+ subroutine test_xvi8ger4_i1()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_xvi8ger4(cq, vi10, vi11)
+ end subroutine test_xvi8ger4_i1
+
+!CHECK-LABEL: @test_xvi8ger4_i1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %6, ptr %1, align 64
+
+ subroutine test_xvi8ger4pp_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi8ger4pp(cq, vu10, vu11)
+ end subroutine test_xvi8ger4pp_u1
+
+!CHECK-LABEL: @test_xvi8ger4pp_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvi8ger4pp_i1()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_xvi8ger4pp(cq, vi10, vi11)
+ end subroutine test_xvi8ger4pp_i1
+
+!CHECK-LABEL: @test_xvi8ger4pp_i1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvi8ger4spp_u1()
+ use, intrinsic :: mma
+ implicit none
+ vector(unsigned(1)) vu10, vu11
+ __vector_quad :: cq
+ call mma_xvi8ger4spp(cq, vu10, vu11)
+ end subroutine test_xvi8ger4spp_u1
+
+!CHECK-LABEL: @test_xvi8ger4spp_u1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
+
+ subroutine test_xvi8ger4spp_i1()
+ use, intrinsic :: mma
+ implicit none
+ vector(integer(1)) vi10, vi11
+ __vector_quad :: cq
+ call mma_xvi8ger4spp(cq, vi10, vi11)
+ end subroutine test_xvi8ger4spp_i1
+
+!CHECK-LABEL: @test_xvi8ger4spp_i1_
+!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK: %4 = load <16 x i8>, ptr %2, align 16
+!CHECK: %5 = load <16 x i8>, ptr %3, align 16
+!CHECK: %6 = load <512 x i1>, ptr %1, align 64
+!CHECK: %7 = call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK: store <512 x i1> %7, ptr %1, align 64
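
For reference, the Fortran-level pattern these tests exercise looks like the
following minimal sketch (not part of the patch; it assumes the same mma
intrinsic module and a pwr10 target as in the RUN lines above):

      ! Accumulate the outer product of two real(4) vectors into an
      ! accumulator; lowers to @llvm.ppc.mma.xvf32gerpp as checked above.
      subroutine sketch_f32_outer_product(acc, a, b)
        use, intrinsic :: mma
        implicit none
        __vector_quad :: acc
        vector(real(4)) :: a, b
        call mma_xvf32gerpp(acc, a, b)
      end subroutine sketch_f32_outer_product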